From 578628c209e0775959dffc81de102519328f0eb1 Mon Sep 17 00:00:00 2001 From: Raphael Date: Fri, 31 Jan 2025 21:33:04 -0800 Subject: [PATCH 1/3] Create index.php --- test/index.php | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 test/index.php diff --git a/test/index.php b/test/index.php new file mode 100644 index 0000000..cd00388 --- /dev/null +++ b/test/index.php @@ -0,0 +1,5 @@ + From d274e398eb8666a79bffb0f5329cfa44f2dda492 Mon Sep 17 00:00:00 2001 From: Raphael Date: Sun, 6 Apr 2025 20:42:09 -0700 Subject: [PATCH 2/3] Create asdds.py --- asdds.py | 1193 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1193 insertions(+) create mode 100644 asdds.py diff --git a/asdds.py b/asdds.py new file mode 100644 index 0000000..c77effa --- /dev/null +++ b/asdds.py @@ -0,0 +1,1193 @@ +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory 
allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. 
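+
+    A minimal usage sketch combining this getter with
+    :func:`~torch.cuda.set_per_process_memory_fraction`; the ``0.5`` fraction and
+    device index ``0`` are arbitrary, illustrative values::
+
+        >>> torch.cuda.set_per_process_memory_fraction(0.5, device=0)
+        >>> torch.cuda.get_per_process_memory_fraction(device=0)
+        0.5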
+ """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. 
+ - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
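+
+    As a rough illustration (exact counter values vary; the tensor size is
+    arbitrary), allocating a pinned CPU tensor goes through this allocator and
+    shows up in the counters documented below::
+
+        >>> x = torch.empty(1024, pin_memory=True)   # backed by cudaHostAlloc
+        >>> torch.cuda.host_memory_stats()["num_host_alloc"]   # e.g. 1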
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. 
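+
+    For reference, the per-iteration peak-measurement pattern that these peak
+    statistics enable might look roughly as follows, where ``model`` and
+    ``batch`` are illustrative stand-ins for a real module and input::
+
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> loss = model(batch).sum()
+        >>> loss.backward()
+        >>> peak = torch.cuda.max_memory_allocated()   # peak bytes this iteration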
+ + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. 
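+
+    As a rough sketch of how the allocated and reserved counters relate (the
+    tensor shape is arbitrary; reserved memory is at least as large as allocated
+    memory because the caching allocator holds on to whole segments)::
+
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> torch.cuda.memory_allocated() <= torch.cuda.memory_reserved()
+        True
+        >>> torch.cuda.max_memory_reserved() >= torch.cuda.memory_reserved()
+        True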
+ + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
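+
+    One illustrative way to surface this printout is from an out-of-memory
+    handler; the requested allocation below is deliberately oversized::
+
+        >>> try:
+        ...     huge = torch.empty(1 << 40, device="cuda")
+        ... except torch.cuda.OutOfMemoryError:
+        ...     print(torch.cuda.memory_summary(abbreviated=True))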
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
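+
+    For instance (both values are in bytes; the exact numbers depend on the
+    device and on other processes using it)::
+
+        >>> free, total = torch.cuda.mem_get_info()
+        >>> free <= total
+        True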
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
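+
+    A minimal sketch of routing allocations through a pool via the
+    :func:`~torch.cuda.use_mem_pool` context manager defined further below;
+    the tensor shape is arbitrary::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")   # served from ``pool``
+        >>> segments = pool.snapshot()   # inspect segments owned by this pool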
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx From 71230670b49ed628624cabc3bf81a882b5323210 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 6 Apr 2025 22:06:04 -0700 Subject: [PATCH 3/3] new pr stuff --- .asdds.py.swp | Bin 0 -> 884736 bytes asdds.py | 857767 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 857767 insertions(+) create mode 100644 .asdds.py.swp diff --git a/.asdds.py.swp b/.asdds.py.swp new file mode 100644 index 0000000000000000000000000000000000000000..917e0d3493daf4bf1a8a6a33d09936bc6fe31347 GIT binary patch literal 884736 zcmeFa2bi2!dHy}driEszF(ten0UFt>QL*K+whXdl*|IH5sMy%T%*^i2uEyG(S7ZM&pGF9GrK#I<&yuWxvqO>=RM~= zy*%waXJq*W8#YF#lvfn^99bxQ{oipNCvzSs8sPUg@Hhg#rJ3mj^JLoINq1rD{q zp%ysQ0*6}QPz(IuZGrCe;f24V_^GEydMlSqr{&}e12Xo;Sci9+x~m4A8;Z42m9yK{`-x&a6fGdkMZHh`T-fq0*vLi!K$;h}%tl?#8k4-fUb zpPw+{{lz}K=D!=c@REN%;lHoXg)i~X!~A%dpKu}lV|{oi|JAwh<9v81-yQsf4DS#5 z@X&sb&4n-Z;bDHfITv2`;bDH<-%sd}{xTn4_1~Rb_;UX|%#Tmz!cBfloY4Nu{DKhD zU+Lcu^W*8c@Z)`Wm>=KGg`eQVtN#1=d%1r?`m6l&w*P*9F8oCQeA<5><`*Z>!cX?$Tm1J|bK$4>=V5*v>ldt${;&@Z^W*8c@Kb$w=-;2_!cX(zVSb$D7rc=E zY9AiP*K2a&r~B~GKX>yBVtD@%K0MTKXD5r@Bhk$Z}QK>{MhO@oRI!zA0Fn%4Y}|~`S8#`%lw8E-Ve{i{J1g~zSX}U=EqlZ z;oE$8=-)^FrF#(4-|oZ1{CH6=`~n{y#@Fw2;h|ka|5W^j7}5{VLp|P^3lHV0`F!pc zxqrg@VO)jzF_jAs{eG2y|N2~bX!lV5HOFMr5A7Pt_u5=|crTRmc78(+<*WF7!~EEu z3$Oa{P>*-#!fQS}#5>Aw=pp?uFT!^x7e4CW5AFP!TzKf$&`#xtx)(zFq1>S#pOy;` z<0_Qz3%T&nFKu6rQy!L0f2V&Q=Eo~@;SC=i`YC#N_WetJc<7(U=fZdS@X(Jpp(GxtOeM7xN|GYUDewlwi)bFmIAPVn? 
zh#AUhGmhOM`Iha;m$og-4iaA^*k&Y5%2+?-Ct<6d{S$B<@%{CBSyHiO%+f3;wU3hS zOtz-_3rgOxo(#=sMyz8$s&$dxl%NRp9?=Ank`9M zskFa&B(`dvO;)XvBC!6cszbg}WEh&=FiD<|b_%R9nl1<}x*&oQ5Fp{y-CZsYB%5+` z0Tuyq9tpw1UA1Q?*7z1sWApYA4k`gJn2TU_rwiL#sCwOB?6qNMln=D6VYTymtA}jz z<=U2+c)91X&B>gL^5Mp6P0KIL-d$0D;-i-x1u;DF>&$&mGaE8W7pyuln4Wh0Qug(rFVb_DNFeE_ICa`{8aZ-m)wG`X|veQRMVu2Av?~-n#jggs7S7a{F{0^`&p6Dgs5GTs!ozk zDVwbL_X>GEga8VoZ7k!ur;_c60$F@jI(y7KgG7rpaxC}ZhC^* zPQXhkzTz0gH*)!8^VHv-PY9&U+iAFb4+d$<*D?Gq)K#7N(NVmLa~ZD07kK4xnr*RS zgqv?Jz^0WMW z%_I%nJ3&1|#-*5Juivy9=9!7gfS_PBRc0ty0>&h(%@obKOp>ZuC8%4!sx>79}aZ(M1rSGLM;iR*4l(FrijV3GSE2sW~dsQe^CTUVTFO#_p5!TS9 z6Or+u482An9mO!~kztZN$uNunFTrFsG0X#dCizFn+48m}42#r2wpJA2Tn^eOaV_dSoW;&yJfz(in|D$d?itMG zmn>EbSQWQmdoB4Y1`nFH1t^2VqR2qjE|<$wZ+BO41@z6_UY$(B1)}JRG83vji)J_NB%i!ySnNM}!xedfAF;%Ft|AP4(h^N=b!AqcAzEvG zuc%dwiOtUwt%bQf(~%Ic?#?b|obOQIU~4Gmva`nqfGM-mPFStfN(q0KE2pm8c1Ehd zP{;OT*z7WyT!k_l(MsYWhjxd(E?2SKY?Sr$Dj)_ki8tm&p5y{r4g8L2xa2Gmt(&=Ka~js`zM2cUcXz71Xio&;vVUN8Yp1B(CmP;ggZV_~C^{LoxL@^%qG z`wkz;Jz);(oSt7bk(l6W+nUomxR!L#gi|5X-sxHalcSuv+1nmVWth%9YHb|(Yj(YK ztyWQyK37ftijRpmGme`ZUyhVR6=WwXsU8xf><|4DmK8e-yb9_8qjsit)ypYq(SISz zjcv-_Eb=Tj%Hi*@!HbAP;imB&4E-XNr#5{nG$Z0#3FYlAx)3Hxn;RRx z@xhuAvhoVan>-Y{(Ji{=z3nN*Kj*bTDP^T_@mR$e|4Em2xQ!~8dC6&BKDSEjvXFbX zpnBe>4I6ypArM1BgT&32cllNDYEJ6sUYD~S66Ow82Nar^=i zFC0WDc-@DwHc3O7q&=l=(|zg9uRDAZKgd)b(x(t#_C4jlNwaiDS$K zZdXidQMtMLM)O{uq$cI;%$_!X>uSESn&)x6!vt5LXBd3jO?{%FM)Ua@=6FekY;lcH z=+aCsihtIaQm8<21aMaxgf((}oixJ=ZmPBplW=v$S2t&8bwj0Pt>N8>V~=_0XD7aK zLEer`fxa$PUn z#IE%0YvV1Cw80(&Yuz`Y7} z(P`Htd8GZ6B`xZdBS>D9eSyU2kv^ubWJ3%y2Vz~1?O9nmz1~fN);sJ z)hEWhvQ)>%;kVAK=*iLLnK`&taaJ`HIQ(L)5EFNqPEC*Ok5!K*gC!irp1|8%>OBybw4prYX#4snhA*9n5m6V3dPK z2@k0`4nRS=^rT+xv^hK|<$BOBu#F|9XE+NLvRil1C&V%E0-@5ZE(svYGiSyPwBI>s5-JsZE8|)bjd4avd%%vV`H2bX%{Ig_ zt`zo$$Q!5yLAhVp8cO=gSmz`UrHB>Ow}m&Lhe_9#^nH)6NfPLmQeF3vnDz2|SKx%z z4d>75y4u;Y0tGSiu&NMAHs>RmWZJRNsooFy2FLb>wie3HGK6Q~Z-V#WgPY>je%Y6~@e+Pxw6biIzZCK|_j zy<7=#DR|ePnIghC(Kvn>FJVvXT07gU%Qh(QdfmpVUf*%S?(I7_o^AbB+pu-(_KmfT z+t1xxuU8cWL`QnXvF~q8PMJnFw8GYq<)E>0n-0t`#6o3{H9NSv?a`ms2hN5i<5dos zh)ol>wr8=VIm2+IEJ`h;aSoS!mmW&BZlmkQH^he5$UBoE|A|@e;H0?UL$ocCAg)Tu z|7Rd(4k33+{+|sZ=jV|1-w6I1ya7lTa2og_^8OFNH^Cc#^Z@q(A4j%-4tO?r3iw0t zNN_iBSD~o~p!fqCQ?ak9 z`%PH(q81`#L$Vw)h0yFy&&zf;7VY;^0ua$;;DokPhI|*R*~^8S#e+L>E=hcrF+0Ua zsusjCIpiC1Mt^O5PfVs2H9)dpRLZOsSKG&Iy1zHYK(qh%w$%B$raaB5TnA@t^Qy)^ z#O@uBg9W|}auB|RkBs&=J&ga9oV%Xvm^vkjW7fH~sm7&?_Bc2YsqI~-<}%*}bFm$! z%RKIJMsqk+Jl3`nTH?Ej2`}-hO?E)g(}&Gw9@{nEdVV6YhP{1y(R7}fM9H@sOCg=T z8;ikn5j8zGx1l*VeNaf02HXyLS;CmvhSuV6RLT!0?&-tj^IGKLFW|OK3<_uE}QsWYQr$8WQGk<5vEYMvJbdaV#bG{ z#tX9|jhI`eqEpRjwj^iUv}xMbk2P&^X3{?`@_&oeGM$ zjJi`a)Y2&feJJ_h4?c=*2qJ5!#Z7(I@jRWm53NK}E?HZ`4+c`VOW!QYSuLLpCT2kq z(8{)0LUxV}+pYA(3A<>uLqLn+I!vo;j20r6=xj|-Y~)5_Lt`kjB@WFX37jq=kJ4G; z1d6afLp>n!v7Z9V9j(vFbA4Z2j9qI(iQ=0~w#>wKqL8?01%6%W_31e`O@1la_xwl} zn;wJ>KYkawQxwM8^!UYKGQjdz%qYJC>#s-NsFcr7!_vlYQekNq>iH~AtKGpN|kyX;W5aSs#&4%u#_{#)?(2~3S=2dj3lAwVzb>S zwm*5%J3v)Q%}=7zd090Ym{0)2v&}7bFQw35nXn|qx)OoV+yz6=NWjA4u9!OZe4(w* zaB(@-a&j!KURU%TB|5#kLS5Fbq^X>AB2Jc%o*FAR?B$G(ZgLJ~!o||Ms^W1*c@ttCrg0&X8&TRo70|oWHAj<0qpNY`jj4HQ z)FbC3fc32C-Uj9|V61 zd;t8HDxAeRNs7OuJ%1HAP(y;qXMuB)vLY>X$^0ZGeGoejP)f;*Nv+U zJ(&bIUw7vZ4@h>E(BHk*aDf>1RPoI=7QD_Cw>Y6Fq&s~7v9@MdNAI;L;H0NHQm#W? 
zHe@Vw^AgTyN`L7YrZHtKNj;OR%YSt_P1PZabtq7vae7@;j;CMrl%9ywMNISx;SRKDD|h$A7rE{T`gYiqujG%tRa&`GM*&JSz$r7FAkpZSg7h5_&G{7vbaK zPJ2w7b_hfO@k(;@x2oblvaAm!wo0l|{3J|Kwu-i@`_(5r5A_NE(HQ1l@2YqbuZm`E zDVDAE8>w#B*#Prg3M4XiM}~k7M9`ezK+#m9dHUMfBIaf{1%jd>RyQ z$>ZW>u*nz3+SkNbH^-8PX==&Yx}`Sx%F}l}PTxp2q{1s*&)qDl`)q1P8)-gLhTApc zyPhU%I_;aZ>x(|N(XMp)vJ`9})Fj_2UX{$(OEsv_7Rmo-G5HjKPV)Z~g24D&fMNwG9^f6pzaXQ37_`9z zcr4JFfO`SO27EWT61)q%6l@2-m8=fF2fhqm2sVPBAftZ>Tn%0Wt^fzX<=_PHHDvX# zf)|1omZ9h@Duu7eZQN(cY!m(&-wpb;7i~mx6$q{ z%Fki3`RP3fT48W{oQ~GOUi6D8p0vZr+OI^W;neHXxPrEnDyj9^|B!8 z3~M>uGsxF7u0fpI z3k5_b2F1Pd&EUYp&>ih->)_vb|GA6+0N=Me!o zjY`O>kfFwl8unN^?Qt(MwIX!AqfN0OW+3FGIrWfTlGy*ro)E_l&M*q zll*!Jy%|rTdU>qf?6e3NPAEE*JaB~)>QgF2%PgyclA_+?^uc6fHCG7)N{}pscwC5# z+F~+2{Dn!r^ogGrRe#~PIfdV>9UPMDAt9-V#;?#AR-n%|5A;pmc%Qd%u;OUsm}&o0 zF+5OK+z;uDB*2FDb~I}ynq}JSeuA#!3A5T1`^C8fo3_mcVsr47%IT(p0^_DoLgnO4 zrbm9R`do?rT+-#0XU%7N?G@`>W3iBS6!%jRhKrUG1wODqTs0A2c8amn%^YzzQi0{v z2zC?edorpU&4#n^0S&7oRyxoXOpm(#z)6ZY>?OR}2Bb3Mcv8zATv7)V2n%j?g~<`? z6CEQ~`FncTwXu(FOw9*%?D$Yr>c+PSY{$1;WfvbDNXO2GbEM3820;!Y7L#!&)>(NwzNY0ktsX*?o&SF zoh+^^P9=6g&MC-ttB7vUK3gBq82rvq%=xlNsh@^4er`@ZXdHk_=>PIgt$Uj&>EsEk zWsrrRS!?kv9aR%WNhrJb1=`^dnnWg$!0N={)}aseSd_=P4&|W(Yg(g3T(lrPW}_gr zLO#M^s7uAI+s;k0i%?SR53i%LiciGj*zQAhrzW|c2Cb=WA!c~- z`qR)%V-gK7{9nV(RY4z}GD5qSCq|hgc4}U%Poxn`@Hw_?E#tgf3A9OB7wD6?AMrgj za6@b#*OtjKzhWceikH6ZkcjP>9;uLyiD5vsHhS`TRhj>CwB>$hofV>%5Q}VxJ0qUdJf6oOc zfL|lazY4TL3k-u}z=Of>fv+IjzY0u%3ivN%`X7QXgD-)%fGfah;I`l>@E+v)H-k5U zIyedZ0=fPT;5?xF{=SMF|4Hx(Fb^IIu19{C@4oEuo5AgXWcUSeJU9+qkIb&P{>K5` z?{`=5F68t&r~>)suK?F0o68q}3pg8$fFB&ibMOW539uh*04IZ0paOo0jQ#~6pZk}A z{or(PEKp4UJA%*9pX$?BfTyc(dw!zGu6Wn;vnc*>71mixBv}YVMYqtKGyST86WTno zchDX?vFYiLADD^tP99dJyT0b37hl!+Bu)%fF!zp6UV^!6&&A>7v=ZebZR?3XeOlsK zaEUW_&vQ7%l7=6W67*CAn-Z!-FP!FDqLdwoiDMoF2jxol_*B242YyeK>ba_XYP=^Z ze3vo%QUFv8IV{=tF}L*Y{A{a(=<5AK$8xQd^3JW-W}^k!A{v>;zt%Mijj4skq6F}8 z9u;r3OPjN8r}WC;e1&}-p?gQ_;W*HUNMPm|(<%CYwGZTK4)zd9xN~ZqX`whcKF%r$ zCa0tf)9v2=)uYtCJT3PK5E2y?Ka9u0dqfvxZV~MuAVHsXJK;_3ARQ!%l3JBvGLqx_ zreq1>dECdJ2vvABjPA8FeS0}zAA3CP`e|?;Hl0y+e@@%#J_gR(a#&O2Xdam6*Ae-X zF;^66mq#B!%;-tyIf6bdcbV^D(b`}R_9)eMY@w@3zAiO7um!2qF0o!By=J>)dUxr` z-Vo8{)R%7$?dzotKncN2*98&oq^Q%9>RC!B2t`uYAgk}4oKR3Fal|%k;-h_*s<76Wfac19>UaAZ~3h}ArzLW?0vf9=3268Y2Bes zV>6m6}d97G|Hat!YU!bi7x*d2qh!6mqTCvpkvzQH&ZgqP+Y`tX*eGw9u^}9%M~YS0 zmB2XIK{LhmZ-tCiM~(lOmaJcsVAlz_PGPJWRID`BwkW@pJss&Vh<#p9G_?I*mu*P} zLp~;_xt+Tc8&rhtT^48{=*2?oq!$HLj5goEW)p=YkSh)Q(-d?qu{q zYz?-?Pbm|JE$$bh{b9+fk5bc^itoX*Sc1&U<^^sQDVLWn_SE%Dl$%=T`jl%4dX}qd zGd|W`?P(6JGtDv;PY<@lT~B`Ns3fh@a`ege-DWz8c_#IEai-ZAV3lD4?wM^93IMvd zzzoIH*H0OQEMcIg;W4B=f#-J<6ML;X9#DC)iL-8MYUTse5Djow9vWDj3V? 
zQ=4dLwT`hx9bfQD9iHl>os+ZjX1cwj-0Xp~p8}MWBX0QYN9(WMA3%CZkUw-|s z1G4+ew|@<&0>%A%HFy=+2L2y-2#^ne&id<|{|Go490R_BEaGCk2xRj=59|i3 zz-Q47$Z!AYpblza3pg9x6MP&!!SjI5`JVuOiH_g{;1%HI-~#YCuo;{Q?hU?)&fs+* z-W~8@{=O@?3%D9x!Jh!>4ju#^2)>MN;7j23U>vLgKc(;0|9=C118x9c2cH1lw*_d< zrmGOnSWJ)BvvdTr(C2*o5$(%o=dTrrR8aS>>1EcTZT$(yIDt#IB3qbesH@I0o2I)C zn6rvrtBE+U3PV?<`>VeTtqQ!2`tZEgY;%H*HCm~?C&XDLy;0cJ2ji=Jgf`QevDDnCGrv5WtYT`gMaB-RKEm&Zp z(IRlwwy`E7eIuHanez!{8_XBR+m_T}5DD1VPH&GF;%QH!6f%#M{w^M7n1cQDr6(CvnR#*27fl-rLsGnz1AGE1VV4 zUKV2M`>Y|59a2PyGg~cdHdrnMacmQit25JP&vkj$SrzxFVd9vG>-7n5#4~}xx2?CC zuOy?M%{=@oaO%xXVOK6MVbjc5Lqm0XGFq1ZhBTDYtrI*EGH>Yq5)%Y4QO|f0&*K3hr9O7yfUBB1xqhSZXUNUp6*Rtuk zWHNs0L*+`U-&u8t%P5oIky%Z?L=V^Yzz!gVoW!Hc4wUpCCMtIf>g84G^v0zr2Aq|R zHCAF$50}I&Enzm8TWb>$fg_HIb86yPkWE}T1Tdg?*OGf}dUI%}x?Q%4=GZ1GQSk<> ziquQ3AtLDP9Nwjp|942tL=KhwKd!&|ybn45Y7k=pK9#?p02Bk@D4_HI*MWZk9|C_4 z=D==nBDg2`BJ%&I!0W(k!5lajoB|#N?g8!&6eI9DbOHOp1>h_Ybp#LO?{A|Ecpi8z zSO;!E{{K4oEAV>oI&cX%4jc<~F5u1JwV(y2!4!BHcn^AjtH7mT5$Jrt?}JYu|Nl97 z5qK&%8r%uo5hw=W+rUe~3&4{>8T=6W|MTEqz^lM3!C!*Mf_Nt2_2>blA9yKv4EQNJ zfNQ}u;O*c%a6C8;JOaqC;KShW!JELj;344lKx6ck+mw1URym2Wp($=o1IsA{EMZAM z_1SC}5S`zcm0`tC^XT!AGqAS9b;@fXm1_t)%&zV8*=a~MoK~?HGCRM$Rm+@CF-^Iy z!CG)te9%}Jo8XDNHfauYm6UG`*jCjaemwkU3yogEeJ@Uh}$8@a>H?#**|v-qDBoF6r%7*WBokyqs&Uc|Yu*koN=bUAQdbgoeRV*QS{G5p={#W|l7{ zOg#dW6ormosVd7^49;UF0xX?M&aaHuN{@_TD=l5lyQN1uv!N=bd)-()iD0c@ttA`% zZWB{#VG0(^MCrzXl>t05B>$g=XnD14&B*^3@JpY6LdO3T_#}wu03`b#4ZenK|8(#e zp!5D`fS(}4zY#nhtOkz&pGS88H}E;|0`Mp>0=|a~|4OhE+y?vrnf?1fasFQlP6NLS z?gSLy|Lx#q;DuldI2)V=bk6^E$nLKNF9AEiPmtHI0UrSG1WoW5AUprBk=cI*ehIDu zyMbc)9|d$iU<}+3+y&ej+y-2Q%zhE5fR*5Z;8V!y?*Z=wdx6gOkAn-q?ZFMm>E8iw z28zYM0qA^y?)lRhfq$c~)!$cww}HGsuYu-pX>$Ve$of3{K5#;tEkpH;6y7C&*UP#G z%mc7D6QFTMTf_5t=?j@K=v$jd`cvgRISSgX(*iPmlOHAr%!r|CiMaQ| z%nuv=xSEkwxg;v|Ls6{IiRR_kS1ilWw7BvPh9vZ&T?e%&gU`dQ>vAr5me*62v0GK+ z{g&ZB#A}FgYQWEiaP%bA@=ia;Z?iIU>ZV|l30Mk9)o%dxd2j@0QY6dVI2TEMLRpPKb`kw#xS}3oz9TULde?*O+r%}(dKhTFWoCQB zVIcPd68f&z_NMiNlKEbGw&ho1s?|@bbJH%zC24!SBAGKy93AR17UY?+7KbYA=n@;R zPEp7-+-~Wu0R!wng=zlK3ZI!cub2ulV{a=|{W<8Fbs;>W+8O7x{q(htez6ag-S>J~TEC?KC z|6h$5_gUmn$^T>g(&snG`F{@|33o7 z{r@nS0=ET6fvb`K{|vkkJPVu-WZ!=$*aJ3!jo<|EUF84If=>a(_iuw1(7k_m1^5eSbehC-6z|cJK_)0Ox>XfMNkkU+~Z1)8OyGJU9hNSMWa2 z0qen?z$eiMJQF+x{2`bH=Y!jU8_)~re!qFp2D`x-;Ctu;6vzKt;I-hX;Bnv-pjd!! z18)TlupO)i$AAZdyMwy{jg52$T2lJROwpO&e=69O zln*HtouGrlekw}`n}#wW4D@xcY|_J4g>3wgRWTdLoP3EId-lw7P+bH*4PKCDudo4? zA=*ivVdykgt2~r|sxrmVtWIpq(CQ&#T%B}+kA#LiJKmWL?BLoo@o!ORGMxTRM^Pk* ze=;-G;6}ERa!-debc}9KFpqV8tIy@S5Vd@sF*p3PLmnRf-{o9tJm+!5l}!KWw89v! zAtk+;t9kd6q4n9ecrb z2gx88NI{j@>F1J|BxzhPVCW#olCt7k3BwlT{YEX>%qeFqy=enbMNWh z{yj*pbl{1Pv<^J>_#@eYXFyKXg_CDkC;G3m{MT;(wS!;XeXCVk>0BCmwsUFhSktjF zHoiK-K@umRNjM_4-XD>$-yNQ*BI%ZM5(NI*Zd>CH0*>7`!^v?qC&M0;D|>W*LiQ?? 
zg}qI*sTRyFOgdDx*;m4@Q&Lv(b4c>l3^$n1Np96zlLj|%W}&}tAyIaS;4;Q8$>G#| z7=}wmn|A+8xYyAn*i@@c^d(ub0^=ha?_?W~X?7>{+w*pHdsP3>`?L5~azRW<-I)>R z8s)w7C==Oo8)J4BJ?SyiaHk#S6F4JiYi4`gh5kBgm+!)8F0a$zyg-JnBoGJ3@z;mB zrO8|b{$>7_55S**X9LOn^6#&JmEd+jF#$i1ysta{o&p{ZE&$R4d=okU zMc_H$*&ylwHuCqw!7q^UuLkc0?*R)yegSs?A3(nU3-Dr~cmSKg4aoNI0&fL>44wjZ zfMM_*Wcory`kaFvBWibb!}Qp$>B%$WPB5Zt ze-s&bq&zv%A}Gm)x(h~^qbt%$$`z_ADokEiAt(Q3>_z4xlbY&Pu??uX>L4=rH+tqC z8rhz$c_QK)Ug+%2MFSRKw>pS{d5LLb-+R2?9+T$W$8Otdy;|F3GmoC$YEv%X%b7N9v1w6?bOgBPT)+M!bJ8W%9kYurkbot1I*`4me^L(b(7BPT#9sltHj*!r?Wy| z>0?!ouQQMB|B`)``H{@#Ov>b0S8CNOlTiP5r3rI6lPo;RrDUrtCWO@uCoMDe^J@24 zhQ%eTcan@6lR#(r-;Ye+0E+o9|Nn=A|3;42nf`wTI@7-u+z%-J|5w15fn@x@27e4@z@xzm za69ll%1MU*CrB zKEeM_0_(t9Fa$owcOL{AOMTciM)!5fm92!upKVNCI5e}!?NnxEV|EQ)H0;R+Ntj-o zU5YR1_a;|Zn~Rc}r^AVukjA2&3~^*NMw+GAMPdOpXYAN?Q*rD4}yrzq6BmvN6() ztuhiT(jtEa0yuCl#Vb_2o9u|ZDX?Ru(w+)Z-;zDmqk}RozII!=U8{o%|B7ShJJR3# zs0^1Cqg#4@?mv*Lx-Oj3^MS|NHgx7Ta4@t9-$Ey78mz3{-?3cjW=$m0!kPL(T8 zSljB*hw{IO5f;=t=)aipRSWabq=R{b*!R}g z7q|CtZfv}~*q&!{5e0m5hAF9t%d<%D`|T|Z70o^Kem-C~hL|0;w0qXwtDWl;Do%Tp zzhaj&Q9Fcp4$P}*zzT|lCIP+zudTY26~_spnlQH33VH&kzRJ13FqoUziU3uDU{O!@wqc( zN0W%_{&0|F$;NrpvMGb~ex~0~l2Z;al3CsQS@I?<07~M5km{aIZw18-dVNU;$J@%F>RaK0|fsbSU*R32z4)SsR_1 zNx!qvNP=oIlP^n+$$sks>=HrZJd^&d&SsWPm1+0-b-ik0rfvTs8FI58FEN5i zqkSNQzALkqrKeN)C%o>tZKgSm>TwGjMWQv<@OzWtBoE5`z9QKHt>U&S7CPm6lAG7U z|E;W4`70qLBEBBz#=ohWGE$moP3w~Cqfa3hy*@#8sm;w}Be)QwsIDTNOD^Y)s^TZKa(cCr^*BFr^6>bObo;uJ@v@o8 zv5`~Pt;J&q4IuwhBmO_OdffPdGItBeJI(1-Fr3Bwc#jpQY_Q{uw_*1u#|k1L@8y$t zMm0N>GIY^}L;GP`MAD)DYs!^%YmYrXcnPHhJt;00q#R4}Y5cvyL)M-T|b;8)_ z+p`I|vcWw=+p}Sli`lIVicq`fW+fT{@|0hfU!)lsQwCnbIiy+nZI-a^?7O03It?ks z9DDs6MI3f6p@B%FykpyV=1s%HE%b?aBPMrWUsa!Y7inB8YSSjBYv zHjRl#o2kyYMKX+S7w>PHiG1a>-N2UWpfy`oQadlxiQEG14AH}Bx0j*U=mYJ>3q3MS zk|!Bjn@l4Cm)XSVj>5s}qYQ1&>(R`W-enriP@yE7T%qzkZnsi7AonGM=JKc}M0lTx zfLypQ&pr6KMA^=-x)sB&+u(OgF>)r%IXZ|v7qh+VSh#HfraO_aRVeg){Oa1-oOaDq`B*3Y zUKu^(R}L#zn9ES(E&u%e$=@ZF9Pnw-Ms9 z<#K@j-~&g$_ZrS(XD}X8Z;Z`5sS6hy=JHDxt6f-U*=iW|TJlwN9i6rz#4P6QZ-x}x{W2bVNW{(X3Q)Z5uQnT=>ACucPms9vZ< zu3~*>w6y-CjxZQul!@abGfMQ z+_GhiR9v=kE55MB1xKS4qN>?hs#jw>j2y&&a*bM%)>BnUkf3N{;R=7 z)taegB1Jc{o^E^><(Eg83!QRUL7|8=Q7Ae32X}2;CuOtwBO_(E>bSaQw+2aT zN5h9phB^<;89aknE;(_qhe!=>s!sJ|_L|jM6wrjMsd`>0Ja+l5mdVymhi6@$y(D&X zWn>Y@3enC?Q5fGIK$?~-v|`{%<(J>*t2LI{M603t)oMsv$$ypJFVu8RVYir-_wV$E zCd_irk}Hw7ke=L>kiJv1DP|OKxO`+4ovV8443pd6GrB{cK+9?eGe!AELk}!=syx;@ zaUCDSfjvz(3(nF(uDi9>87e+f-frg~Fa?rC{yCX6iGr|Cn*KUp>*-2R{6cY=cNV*6 ziE&B$w4hpcvYQugm(DKeXp=$enIUp*<|6MTt#qM5x0}`OsSUQ>xTVa^2vAv@<~~PQ zQPH#(AEcxsGgjpav^Q(nE1x6lXG2rR;{LIs>Y4QQk_wNXO^0G$4HvJLWy*3!vlR2w zZ=%4u6&O2aOzOs?Vb12xj_%zxIy*T-oux`clL|=?$reLv)*z#FX1F!^zs9)#GRXf8 z{?_LzWd1t17q}Mrei0lGevK^uL+~N+TJStD4~_*tM3(;n_!@XG_#5y%@Lcd5Ai4is z@JnR+Uw~JGSAjnV$Ae+;0PtgE`|H6Mz~{kBz#o7HI2Ak)JOJDud;z)t8t`gx1-KAA z9$WyV6Zj8w0N)1R0@s2CZ~~BC;C`d`-6R@`z-rDS7pY zO|Nh53TClFwegm@D^>L5=<>|mEJihGU=8iINTF@%yG)+YBl~03qe{vn`w7b#bDRV@ zzj8d;Q@Dv8Q}3Q;Ug4K3os0=VbuP<7Y>4foA;y}d7_855ItVmuj0xvqj^w3mCG@v9 z!{|-YG*p3^20=oCO8f{mu26x4ZJoVtt^!>RR3C(|jrnUC6OxX*IspUsUWFa#v}+oU ziudjgo<_Q>$Lx4>Xt3#Dm+IJ0wA{bh`W9%(9R4;2W*9=IkIrn6bwT+^B)g#QvljKYWti?WufzSW*@Ahmvg zqCB>UP9dB+3`Pn`E`dX~P1tUosnsf&OU6+p=X=g}|KQ;or&D;Aoxb)cYC=@qEL)e< zIabQG@G@Q7TG3wVOsd}6F*QFip|g$~5*1@j&y#;uOiA6MPuR>=_o|zC8|*J7eWk`F zMO*7)1@&#=4e8rsc_n?{qid1`3X03lR!tbauKFnlb;J3yijF*6mggs9*@%~)w{f6J8XC?a*XwZKtQu|>uY}MRyz9?Q(36cP zeaR+uJr%DuDDS%SH}(3C3wCecx$$i4x7voSTeoklZQOqD=6b!VI3fg-E{A&mz! 
zC*P%qQmxzQn(|~wYYLy4bc6nr_NxQOyML7xxI}_vRazU>&hhHPEZ%#SAPrTbEENLL zm=xE>p{b&UDQzU)WD7&9)W-32^NMe=XfiH1RtTJN@+}*-EaoGjn#g`p2s8U&QpWlr z4pwv4sM^^-f%{r~f4yGLE%v1LQRcnL>mLes1Lwgv0B5d!Fb8Av(=!$1y>S;$diVOC zT(|3_V6m^W>pPBDSzBFBK*+Ha+T7UgJoZ~P?{BGEB}HKPQ&oq2p=UNfW5s-v#N`Op zqoxZRP`LEaXk6G1<3Qrw)m(u7A=o1;80BowPOJ$6IbL8+&VzEm3mz+n(?*dSs{4y! z4^8SReSJm^LE9R>sn~2(4;dlPwJkGlx#zLX$()OlV)W|q!gobYijQ7)6m~-=;@8<~ zxv9dc0~^-1h3A_)+rNz_uPU$`gDd}HZ$?Y^b#dqH!g=TUWRtZ~rUn5K-d zMy_j1q6O!&VCtYX?^a8Sd+dJN!|IUBO$y0a6Ip=^70IPdwN=?B#lKO4hdz&bCMoS? z3Cd*>lgHmHq!kWf7)INuE2pH$!nmo>SZqmb33Cm0vs@-CV^J$T8#hKJ1Q`0>QeTy}Q@baU16N{!x1^cA_>l5D~s80UiF!xg)c!b+)*d8Q$THls$yw~;NuZMe(i zD{`Yr932uExJB)$nyBUZnb?VQZIk~A8)UKOWdEO#@XzNJ$o+pBbO3kY_dAgNp9?0z zL%@T4GJm+b#cupOKWHi7>|rvEhfC-4#QVekqd+5Z682Tld|0a@8!F#r#M)nFJ5fo~z- zzZ&cWy65j>$nqZr*MJXz{a`DQyssF54*;J5-U123USMLo62iiYa5L@1Gt?C3uA(0!Gu}jQzZg2Uql01O(Ct!f(5}(F;$xDAWK<2qL7x>e ztJK0DiDXgc6P9=UF6;r-;J=_@Sc1v~5i8^yHwVq*-23^J98$;SD{?qF%Hhnt?Xh&y zcC8rIr0+7(mm}ra2DML3%dLXYK}L6s6fq5)K-pBY7xa>~IJ18yeMV=8zvG~i0aI$4 z5HV5piY`&J%`(f(5G$d)y~XUuuTJX(i}Ex~zb}+13HShe&-wbxOocEotehzYPiLm3 zW{5BhD#jF4bZKXAA@lrPh+BQ^vhaU0GvchsEvVue$?E(nL3b19fVy%pCLr$r()J)|}vFnND%fMkj+@+&-(K@%l6QpF_dL~KB_-JvYJ*D;U9`WX#9ll64TdpGOt@tcZ z2-lr&biLk&Si2aIfg2r@THI`Xqe0LosY$$jB*Ph^KF+JPGpT)oORb2Qr70IC8XJMn z&oIYJDg;`cnU$GbbgNclO0fgwB!C1Mzw}Y;|M!8v0Zni+_zv>^%fL8jf{TD+|E&VoA@BbacnSCu@Dw1w zfOVh*{tKD^v*0P<4?zn&0mvUfcL1CK9s%wEK8O7OR`3>}djg*gnqV_H6XZv!s{3*a$8eggLZKSC$)dGHSKG@vsC zTYz*1iXHGvbOhf9p8{8cSAnO4S#TaW3EUq@m+;@{3cd{f9efd758epmM{p825u5<- z4t|K<;BUbsSP6~-iYfRG@Q2_suo?`5JAm(?C-^jYJMKLGo{B#8GB?%;358@wmD z2e>=T=aLUTNG8oPbjA1#U)co?#C^Wv!|cE=xDiZL58&u*}QymdnUPJCGRJXNxSkWT<+aBUHrn!lFw%7+I**U zeZ5qJ;^4q}a@&Q`l%ejn^h%X|)~5y8Br)@hNhm(@-kl}>m7PD1t-@Mpep)e^W*n-y z<&p(0ODZQ4nL#cbF`Mr1P10ZDd+R@5uPIM+{g%ziHl1p$EH>%5@p!8xhJ`P2P>4q- z^f3NYaLuli{RPGE)fzeCV#pELHxpn ztmHp8v|tWOpTxL41lN(cYYq-GxH;)wMQqr3_qeaASa+jmelILZ|lFqK&DGhY2Sgm|28gz(o&6tU5WCatnj7D@)}u z2w&@E^k6TR$~di}&p7RlT|qJwUm|oX61aGMJCEKs)`w@BT?Oz`61vmgYK(haj zk?r3P#=yhD3UFWWG35AnfwzJ`2a5l91~>-%47vS-;4i_YpaCl2Tgd1i1XqDS16#n^ z;4DxEF&5y9k=1vA_23jB`TYmT=idik0WSnk2J-cP6c_;~gWn>TUj<$XnqUl!f>XiW zz)|4y$m9P8E(L4A=a9wU4Xy;T_dgEY4d|}Fj|1KJHw~Tu&IAttKSI|27*HI*^T2ID z349FM`lH|A03s%Hl)JOC62hXgUrQ#O#5l@Q zca?B_%a>BrlZOA$QDMHxS%EOm<)gZw{zR!C{97`*f6d0T(4dALJTf3g#^nTL?1`(S zmtjT%iCfCmK&r}de6G33!bwcnVx#LAToDTv&Utg-gDSB;(UD=5zo+itvUcR4$2O+s zJx)kvd}vDx#Rmtfb|>Bf+s~}xqKuT+Ua>AnS&+GyjH{Z~%BLs! 
z|J3rj)S1Bx-pv+evC%6@vD&q)r52}tr zc2ir!K=%*rl&^rly!VdrURWt2)mH%2>O~O8oa)+5QOir-}wn*MXr;DLktqL*Cw{*5!WbD5zYV6 zRt2fqN-)M+#=E;-j1^CYkkEp}!AXwIOhd6IXQl{QRcMW}iaivmlPr}}lLY-KQ*015 zV*P1orZMqmKDo{}5{d|+tNOAQG1|2}F)9Ssi}i^#VhIXVl5%kw*{c$0lZK=~pTwDM zEV`b%s8z8M@ny)$%1by>C@MR(?@W&-NXNubof=nh!f}42(?T{`T!zCIVw|!X&@Iko z>yVsxHP4Nrb!de*lxkLA6pQld~&w)%v6 z*vU>#fQ6M5PV;MuVd)m}NnEVcPAuvD%#IhA5rS7-d5|i9S+PI-R9YscW1y%q`nvD0 z9V(J%?LNvw2E%-3g>19ED^7jBGoSTvy6K86J@z@2k=L!d^~^HZWc{Bs)a|!q>1OBt zQ^sa^_XH^nXWW0)HT`wxT~l2X!)jXbLx0{KJ~{ksyD6kifbm`nOhK6cDlXlr^xu%E z1CF?ggNG&DGfejzOH$ADoGZ`k+MMg~JauZ146Wn#=4FD^$5XuM3t2?QAladlfh_LN z>5Ii}zNE^qvW@0rUG`3|oHL1| zpMbv<`#cupPerX06VXk{P-OXso0w{7#I#f*4<33#;V|^Lz$C~0EG1|meQ+$XX}!zF z!=tG5VY&USs`hjGJj=_pY)DiqeX`#qRkV#DP#oSq65PMa*7}Xaw9CofsA(XeF=%Cm zfDS~^oZvvwR2n<kPn)z*E6AI2}9?NEh%; z@C~4tfYJx7209P$AfS5zu0;p%q@WM@5i`o9Lx0>^@1Aj7``OaaCAUj-`Qf#3l^_V`x- zo#TH5_&&1y_rMpy_27-*GH?pG4Jd&tk?H>$ya8MUjtBpXJpVed6C4M=gbW|$`&s^e zFi@Yi!FBw7DH!=3n?;=6YD^)qlsAlx;T1$ImnpH=9L5vxF9#X6FQ0mI9I%(WBNM6# zN7#RF%{lxBNz0U~dz(`;OjkR3AJ`n0kA)$!lin7EN&&41O3uuUhu$0bQd?!Ab>_$E zE4;(;k+2B1HCiZZC5=M(%*x=t#W0_+wJ>iGXFMG{<+1jt39VLKIET6$av8XaxRfBG z!E1Fic}kG@rIL^)nPCBnKH=FN%4Y6#Be-To@parOh^eub6hs>H2K8MwijC7LNR(BDWVd;TsY>bd7{3lKbWYqK_pKqZcW9n!D4(DYCX;& zVTQY2*VuVS#=24}E~Zvt^X=ziNSA(49>B2M0@D1@4ou7;51nLjkU8wRAQ2Zg)!-G! z>32dsbj)m^Q2ay$OnSmNcFhZq31c?^xQbz{DNjw5Lp0S~_xx&=bfm)|HphCtPJouB zU)QxSdNYQTabY|<>K6+O%$Z&Ye+KEj_=2@;o4R|{ug8pQq=E^J#M=o9vj^Vx*tP3U zMm@y(vo(H7naF8zQA(p7maN9nVu{(gAQf+}%nCT|*MArbq!4+Tm(6Tq6)BgOF7{Mz zpYL0)E%l|!wFEuORkf*;bys_ugXK)KOvN)7{08bMwK&IKtkOI9JKJ}gozs)QL~*9s zph0MvfO}@!9B_p0EigmXH&U|1EOCIGCJv#LDaAGq8&)89UuSz<>hGNQ^ zQNWs=UaVe5yN1)Q8BRy%K=raPH-(Jl+?XdR;Z%oT ziXe%@h(wFqEG+DeM3sqVQ1yzP3cO){^T-N4jojgQ^p-k_V&r?+?@K|wYZZ}Nac7Ln zkvZoN#ps8!qFzkObr}^Kn&0;(TuYFj6ix1LNCQ zVVct&sJr87jN)m`p881IpT_MMGcbxHJExrBsjv-K2h&$t533PzUM2BSv;&8jEYY<=Y$>6ew~tAT4q@ll%@3+ zrwJw-tGRbOP=aJ3D3p*LHsdIhcErq)Gy23s0;|6W#xn&sWe0)88%Eo}WXlTl*=DYz z+Q11Zrs&9Ap93Rro@%kD!j47&0Xid5%myi?ny?(OA6;xqX_w-K22)Qk9B6Kmq z)rvu$zuE0%VXmZ>2|Cyejw z0z4jE07~G^$oCh3c<%p;$nj4HitE1%YylIT#0z18YDP{4etUXTS%5?*5bA{{`Srf$aWYN8bN1_*-xQYzOCp zZQwV^{;vm$|9>t}48MN{p9UWWmxG6a?;-2|C-@ThQ*Z#t=C62uw*e*aTjc+DgIVw- z@c+OMk@tT96yxt@KsNq6gWm@U3S7GFR4Kpg(J~3b`@**8Djye1)a7+`HXv%S)JT^kZ@ zq$BmL(Z$y!N_ciO0?5(XVKb?gcaXj1LbQ%Hfv`F=ZRnoMsM#$R%b|&_UZ3zrGz10q zqvww6d#V_?;8dHf%z)eL^0d+N-vBYjd@HX)CzQJ~Tlj!fGIb@fJE8oo++LjMlDx|` zV9T$BTPJwm2`+t?ZPcm0#q=_+33GjqcePe&T_J?qrbTB}XpgRk+o^4hSjbqrQLon` zAxAHsiebguNDSw_h`9cg0~xcH$N?jbH`tZb^& zYBMoFAY9`a&+&LQm(V=wUA>x|{5cc_cUpn8!f~=4l-@6?O`1<<&c=pE#u18&%))4&8=4k0u6_GBz;O*?#}ua}Z-sLN1m6f0JMO zd=Hubo8WC=0qh6!;L+fx$o@Y8ZvcM?{s3GI6dUkZ@So@e-Ugln9t$1}zKR~;{Xp>m zF9nNW0*r&>!R^3@(F;5aJQJ)3Rj?ZTJGy{(g2#hLfe|3xz;A$J06qigJiz_I{lLE< z`)Bn6FXjJP@M!Qr@ZZS)H-PKG7r%dyD2K);BfbIZz4LA!(U+^>Z1MdS@gZBcR;okzz2D(E~=kwnK zo(ld5JOyZcu0&T*=l6YnM?01)&Y}znGc#)gYc6Zz5=@ZPP)ECBMtPa2*d%3gjI2Ft z+ezTM3i?&;twsL=aRge%bG6JN%7hpX$odT^f=ZlL`P)oU=NxoihCDnHJ!(vvV}%@X zv0h~r$fryTX}3y6;c*1AH5yc{X+x6{tCsG*8!ZF+2Hr>TTdrF()jU$t-?F`(mHxWw zJ7}lt#qK$3OZCV|YF#y5R$S|PqbRePFZ|6fk}Y3P-v-sW`nbq(t0yx;u{5lY`2wxoVgM-A*o>k(Qxqn59UqJy5cKGIjq-es&Vx)tNk#I9;Qaa)~mw|LDm@~Q(nqirY!2f zrP|Ab6kD>@v)SY8F6}NAIx_Xa@^_K*5aT0@a9(S+IYB>1!?gE=w2?_~6g~>-xHq^HxCZ%N=l|D& zHQ-_3I^_6gf|J2rz->W1_x~>B_$Pq}fd_(5Aj4k))`71gvwsSF5_}LW0@?q+i=6%) za6NcFkpI8V1ysOF@cTf0{vJ@}?=pz~00W8PnhFCYY!gJ?1j`^uMj|$#kdC7n%w#E3 zT*$}FCn1w)AgMAXE2%Xx&?e|aRXYpjm$iApUAg}MPV;`t_HID!1LF;MI@NB^_>K|1 z0Cjmti@l`L;R0Uq$FTc&DkNk)DkSWtC|L*H=nFUJ-diY@8YK;x(J0Gc3AIy}Ww3p| zc}YX@dhDpbIP(P$Om69L(4XK`SGAWx(n=$46$+8;)YDl8&NJtuoS%^#Z}x*%VTrg_ 
zymxE4a1EzA^@au7vA*l!3aJDGYSQQ@zh6d1yGA*CHY|8Urh&(j&cA7wW8g-5ugHV) zifnO8S_Tk}ghjN*Zi7E9%EkdbBF#z3rj@4Fe9KPb`9Q}J?&0tkXRUe|0mKY_930Z! z&PiUo?HCVnBPRFhiuHW>h`?rbh|NzF;#N89>Bvb4%?V{{nf0(2s;)QAfOpF#BN-=P z&HTvTFQY?Ve&)M+^~TToK189gAKB=ohV;$zx_bQl#;g{(2YJ&t4mta2S$4{6sN5Q- zjA0hqPFD@l@9c%lVQH^OMhhOiR$JWpV3H(jL-$rnV6FD4HQ_a_rZK=;ReyMNt((zJ zs$WM~Rn-l)zT)PBdwOhe52uno4aY>E?R(3a#TNQuBxtczmOlTN*IOUudDpzkyBY?* z%pCR)%@X9?SJ9n1LtXprVSVQPfZJFP(@&o%atm0|3AM`1@mm*lUut&iqK-FBF2AUg zsdc#5bsr0F7dtBXpUB>&j|KU^&fogH99jQmKz9J14Ze?@e-(H)cq35k|1Dqx_z7~p z;sE{+cn8=D&H=wfzE}J|#rfA6|J#D2z|WEIb@$(SpbQ=g?g>7OTrWHS8$liDJirsd zV}NY^Uk6_WuLYyvWbjCE5;z)s8rlAD!8^eW&^>>@LAL)j_!ZE7e}4vE2+jtI3m{v+ zVgq~%d0%$^cLK%!TLhPYhk~dFXd(Nb1vUf42lyZ4|CfL>_GW4cl0}e zUem(vFQ2d!ljyhJu5~Ugui@P7+GCH;eW%C0o(I>$_2pJ`ff&Macg($~eD`qHd0*}+ zyLK$>l=4X@K9Z1TDlzpv)DDy8C|=z2ZgwtQHxlkmQr1cF$=7SsD@@}v^kNqdodHh= zPj$}Z8Tz%&f1TyOcKfd#{PMTWxsf_dH|A>cxv6w6CETlb5B`tV7UKX5uA;p*ro*e7 zQ=|1UGGgwJVj`oPW2!fC+HuHdJ!pu|r>G!609}${r3wr>aGw zW-aXH7AyIY$tTU`Ao6(jl@PDIq^#oSka)Nm?gpPz1UgNXtT^KReG55Pf;-*Vlr%4e z;0c$EHfMDDEnKE(Tyv_r2zTWYy=`WSgMQdaC)?tR zzPxvy`M??S#+aQTPkPL3H>vy8%C{jdul6!OAOaJG-elU5Ov!GEdsKC9FyFy(1%Bl;a6{QoF?9C(;b#B^h5OwbDpV)msQv07AUKT(7=tM1d_6oS$$rKJyA#aKRu1FUV%XpK?`Ir zbZtz0W4g_kTFfcaW&XvH@2+e)19VlPzJqj=?WKi2a=Srn_N``vL}3*4Zo$k~Pjyy- z(+IKT!hg_nNK$T-E(E2kmUr4#?MJmThayvJK~Eb&P-;aAn1nFtVIh#?saYUwiHHm` z;{AM6jV=Z+fj&lpawMBx=fhwrVBMlDvFa^vFUYsNx3R#;dl%hS(;+|R1eMX7E~{}_ zUWuC0&$wg`hJyB@T?e%&moy%vY-w;;fKDD^2KJD6%7pQ`Wiyq_>)~(Qe3JbC=WJ&u zB74gIzlUG?{1%!23*bud67VE&4mbha7yJsj|103V;630kK@I#qkT1Yrfs4RO@IT1> zSAtnE1@-~y1U7&lBKKbhbRR$sTmVi5!{Bqs{r?O;4W0*dpWhka_krR9`~`S1I0r}< zuoB!C90h)k?Ef>M*Z_Y7o&t7)%KyjSn}El4mG}QcQ?^ndv}K212N~*dB9E8E2@!FE zXU>-aOtO7rW-~So- zBA5iX20w+@{|WdU_-n8WYzG&BTY&$B_x~z*C%6_o6FeMj0e1z83GiFsMezUofaC$P z_pgJs;J3h+=zrP#%T8bsJOXG;UMsmw@?kYWcbLGpKgxf0e%e%;!S_Byr}Sm)O%t=J zI}@~Sk(?<};{{7QIXMpI1l{4P`rL%N>HmQ%2DtP%aK$ClR}9oI!?v;pAJxM1n6u@z zlTI3&!qi0<5<|NDW@G$j{~@$1uuvmJn?qMq9auZCMnTvPSJY!-9OLy#+v|!KC}Sq1 zKQho57$TT5Iryo{fBwf&-#D#m(bRZg^8oUU(>Ja0?^IN-ZnA<0j(8(r4Psc*7BOHm z!QYyr)hmc7VDbm0G4nqUTRO^jiPp)ps~+*=(&CN|ow;#C>HV-db6L)PFO>6_5*+yk zKkb{mBMa@yn`8B%U#!>myO>3TV9x_TrkZ69>XA14E=h07`0i}=I7t;6(Id}z2wj?2 zHB?SFdgb-5A^#54lxwgR)li>8Bbg#$@bs0YNdBV2y;~+?5zIXv@*&1Y;wk^8qnpKn zZ4Y9iTpU#!$S#RRTS3jJUEv;^t-R!mhL3;4OiS0~R9=!z=tId73ON;@_sXcdx^yS; zx76tPDNb5FMbem0%4`6WiE9!-%stB!ES{&3*U41yD+n~ZnGy*KclycGhe-#FF$>M>`zLAKwXOG$qR8f|!F8kUS=67=-aYLZ5du5Zmk1S>59yR3*GtGu6sLQCSe6OOc^~tlR z1^dkkkUW#cy(Zrt3+wQ>Z^QYJhK{M?wAlQhdQGdGxLfCEs~2^2?}dMMYeM^fS5)`W zz%6#GfN%8AHth;%6e@bVQ1k95IX$4ng^C<$44rY*?~3f&wX;JQA>E6RFP5P*H*7?s z!*;DMK(LpifL^<;)lVXoq1p0>9baagRano0TsG|QiJQcf{VW(z>kqo zs@u=49&uWTDxXgA1Kz@7ju}a$F0ijOA;Ln>=RmFG)l?IZ#g_U(wFc|B>vgOtch-T& zh1AR9u_Hg6IkjB&Wq0VQcemidnv2(c6lh7z9@5^+oE$7)&FEY;b>OP^OAU7UN>XPj zqzd#vY@o}?ltrQ0{bUflE95!s$s8`M{JM8p{%+{FK3?>miuW&)&SP;c_xeiYOAx|B zuZ&=-tXUaVqhWDqqUfWur9m3kd9Ghyv8UmOJvXIi>5J?{%2kww7xn(qN>62r)^pQ( z(|WE?Jg(@(%CkHkTFgol3~_%rY*@JMh5_%uBGr@)87Bf%d5#r1m+(7pY~ zfrkOb0JtBx9r$N>_A9{!;C!$RtO6&3AJNC30P5>cf_H(pg1^%g);!XceVi6)rO4e5 zjx^v1%%*j-sm})cPFN{tCp`#PCUu%@-ry8YsRYG$@-IC6)2OTN22FbutK$o`;aAfZ z8YwG;g+;$>f2SI@!9zk|}jQ3eqRokXiZBCj` zIor7x2#=EG=C$yDYpYd})oX^0$#aWC|E6x?U1KmCYlT`;%0+K?g;2G`5JVwuMvCkG zR6%)6*3|t=O0D=CW|(YYXQZ2yBV3DIlJ$m1TWF0;jNyni7v!pKH@)Cg^)yGoamX<~ z=vS|ih|1K?@yYf9Ts(?HayeLgj(-#Bytd_{5huDIH8v20d7N|;qeEwHSkIw{0Iwt2&=mh~te-M=w2C|Z zykUV-Q(pS;9z>b4;p*5}d;b6rO1v9c6_bRuZ?{7RF5N$Hmvp9WtYO<~$=NcIRm9`a6d+cO;TzRzHx7t=GPR5EV zp-(o|dS!Bf4B}enEN*9)dPITd)2qd)qtbg_W?o|A=rpb_Gi~hqK*x4ZnhPJibBFb6 
z<6N70^z;s!a>ZUw+O*xKMMd@2r??1gzwpFyWR@(UV(*6^aT$bnSDY@M?s`v`P)zwW zS#r88zr&|AuG&NDV|N*z*(vJ05P=pY#y&fKW7*kU$+Ghp^t;`(ZQc0)v*6#}0Us*; zKYAhiNcVpgcp-Qmm;)m~HUKAqTYzuC_kRRPCh!69e()M_2n>UJf$QM^UjyV1Q2qdZ zAN(G8H!^_7gU5j(@E!R6Pl4-zV*kni-^8a8K}Y`27!n$AQO!#{l{DyFa)GH~~n{|3>gk@K@k|U7Low$Pq8qLa>*i#X9T&qncsW%bgOy};)z4$7q&YeIopyU0q3@X^|ga}r{%GqIJ8!qOmLRK2vvAl8;mE)+%V!(hOh=TfMOGd_W6 zL0W11eKVEQ%UnC}k&#JaIL54N`xa;1YjwO#5K9SUk_mkf@$Z1x+iySiNepxz!kWcP z^Fv&3g+A<5L+Vh=JW(h)T26Nrjl4jfbk~}<(V?O0%p$>Crd@ioPd$S*xD8(x2bt!Y z3vd}<(A^k-hsczqPW{%-&W~V^5+kp3Hth9ylBwx(YPRepF&#A^=F+R=PRwMN@$EjC zYY8!XOR4G&?sbe$H&wqz1F|^#uh#p81}Ea=N;IH%S{8&T2-AG3MEO`PP$2SLY+^j~ zKR$AZ$b^YJj&Gp~p~#s4FkEXY!RvP6Y0-|}<#axQmJ9Y;(X2^|8q#`J4=i@-Jk~n# ztQZcerI>!Rd@T9J5EoYbkZU23y*hP>QE$cm(MOpFS9}FnIU9N;u$?ltJ^SPoXUg{o!1^&XCfvK zyGf_pH1{rGMTL>3wUY6s0=8tVES_?~miK|J7U4!Sn^Ly0;UxWV#dapKq@c{lalOIT&uEhz6M$rHm79x9dhR9Mf;_Ee8qm>AAm9CF?C(LJ~; zH_mr-h08}}(1I(=o}PFFa!jK@5q3cgTNG`$dNg;?Yoj%1W64>8sKX;FOD?MTwPQ^L zRJAKNt)K4Dhb+uB(Q$%kdMtpM2a7Wnvlgyio^(3Cr-pVUDY1lYU9cf$WagkSpnC;a^@z|(1?~er2p|6^pbowYzy1Pn8Mp+j2PXpA>wghQuYU>H4=x0& z!5@O3z^nf+_%V>p{#D>07zeilw*@Bv@$k9LlW)NCen<=y!8>)t((+=SM(;!`nwVhy=V~ z0>LV#3&&fi`mn#)>%dNg545dmwexzbhivi{+Llbb!t>bXY|cggaBIDxXOI<8Vh3MCZZ)Z6K2RkkNY>5@cs;%q8uvhv@nWc3gnD2%kxj4Mth^Mi^Uo@mc^ z=$yIcA^atzFRqTWcC|l6x6YsjN(pLuoY{`UO9{T>7{xaV`J{R3Z_g(g%3S7QsjdN9wnTru3uw-zANstj_~HPI}edUdtLIIEFT6mrY+ z@xNBZCUPlNp|9f(E9062t&yhWwn+eGNK%djv1+AScIT_@M(Vg|mB7F)ioi9_dVVH$ z;+(ttjPI|GJFSwb+1^}>Pr^yRE-Rtf61RMncDL#Y(lcro>De-%66TQxf@eJ_CtH=QM;jBVH6XuNW< za_TRrSGiJUl4iB@GMUS4!Wz1KBGO@$q1Py+BOhixGR%@^8HN$yC5X%x3VJOF)khg5V~>h10du3~>< z(4j7ZJ`Vdfb}H{;C&6O!QR#=oD_(sjnVbdy^Q0-YcGs-I9lh+iBOHbYqA}`P* zmbge&gn`dmqN%N}OywEcYt8Nzv5FC~#d)H&FjpixvJI@ev&$K0?NOh^^9{jVI(uvY zm@=#FhSh4Vl<;S@cGiZS=j8GW%VY_H&4S4kDwG^VtMP{%+8ruUs90e(ivPa=4(#*r zrQ-jG`K6Ea|9=ji0-g*Gg8u_P4Zr`FK)(MEfPLVn@cZ)pcP)4axCXoeNFJd30PDf; zg4=-qMjr4H@bBQmK=%UV58yIzNAO1E0dD{g1P=hR0r)ENfVY8zU<3FJ@_;9Sec-pj z3E(@(1zrdK8N3jj4Nd{S1zv`nU{8=2+=kz8M?Ua2@K$gI*Z?Fq_&9QbkAaVZ_kk(U z02LrP!5hHq!Rx?dK-B$j68Ii60LAtDDtH!nIG6zk!8kY@ z$p7CTg4+Tc3mb*(hvovDx3d5`cle0!33FKY^gPr=rh;qiY)$PZENQ<9CxfTG`)dVE zjBxAbV0$#@VLJ1OwK3#xSa|71qoyK#t{VRpA7ht2hM5~*4pl=Hq$jJW9%7{I5B(FC z6(dm-DSf{{=5Mb|`zZ$g|q4hQGrBuT7jCZVKDM&@U2s>d?1B6BgG> zsP3Ak3n8L(xN*Qt!6}GFz$<=WAFLT6E3c5O$wQ%AU87q)*q)UCb6yKOrL+{z9;+Cm zKk3pA(WnZUSA5Sa=2nYc7IN=qR4+Pr%a%+)RFWVOd?EiX2z*lJwZF>+@xFqnPq;;M zW+ozf9^W_6yy=_n$GMgv$w7n8>@L9$2&}LIeLb` zx6{-o8e%k`pJ9$yRLFLh2!$?9a*_YD=A>K&iXwoz(jcsp;p>DcRxnewbr}6FIqB7! zomGTN%UZ*`6~i90&`&46Q9<60q(EPnsxK8~b+r$p^9jT5qlWawsElObvsSZ@6ox<1 zDCE0|ww-$ZHnOzx?66t8C3dAxi^kZrU%uv#{M<#8(ZaY^WFya=WtuZVr!`)E8zy{22PIq&@&%5utn`GkP$ByigFf3F z11%6L&FT^ZqC5*~+_3i>g|pO_NCm>MhizDe1&Tx$b+RIY0VN_LDr|x!s_(5ba>TD! 
zR7V$)J_Jv*ez=FNr=pB|QFX_`nWVSgYQxQ1|V&mNI*ph<@7~~hdUzOXO(c%P!_`{wOToMT%mfieiN^6mc?o$sthKY#x@?Q`9v-%a z4iE3Xc;Bu)TQ9JFYi!xEW7pQk)?F8F8y>F935aAHqSy~NCnn9pA+*BQkmaDU+7tX+ zYmZmiqpc36ZU^+I^?{RN*?3h$CL+^BtsPjbXwEPkIf>E;X`F2%-{pr=t=sCd@h!38 z4f0Mh#AMgO>uy0?r2Ch2Q@U z_%e7YkR0F-!296sp9G!=9t9o=?h9@QZVMC#@H+5JpxA#`fN5|l_$EC5`@nm_%fJlS z33h-}!I$9q{~df7JPIf#z@^}U;HU8ZUjqLD-UnU;+Tf4D1HeYG0jvk|4ftW?0e=pj z0{#q~27U*;3Aw-|I31h@WLt21@Dt<$F9FX3vN33ay^dlkdQ%fj`#AI4geIyG?rDUzR*gj^{{k;hW zT1$U#OP#M9%F~R>b#cZHuX^l5^xpA0SfI<;4#Jnvk$!uFeTIkepNj9!6CD$$M0U(N zw=vnge9;~U10uD(>(*TIT~HU>mHy;$k2{)!q2jT&wa^kjO^kVoU!7|g1U-E?Z06Bj zW3A^m5*z5-rxqJ}CrOljyR{P1*}JhAEEf^e6S=L0HK61{AyMvdJLF{vr(~T73e@+->|GDUAa^gV#B${ku zcc}xzxRMzTM0uD(;L17RT!|?kf*3E%iriyvRTmFhQyfW7w&~EcqaSVB;>@Cp0v5RS zt^OU=&ElM;VaCEb<^eg}pdoMqyCp;`ObC;0)h9-CcFtem01xL&E1Zsdjjh?&UMUZ5 zIkOg$i|30Itx4AwWfxGr42!cEOIbPNyox^KoJbA=18)?EXhAxv9j~gZ080kR^-R+) zfy}GfN~0I~d#8y{Cxaj^5qD}2wRF|+q2yz~_$aa=u&huMa=vj>-*vo5ckV+g;gkza zVj$Zi#$w&{!dWe!EyiZCBOsNXW(nCnGIY0+6KCY2Z51$R(Orj0b&b%1#nL|8&=VWE zq1e#q%50fSGjIa;7n4WntZ*Df*q@;uZ1Qo~1IZn=&)IW*UtWw|YeR|Rn@qN3V*93$ znP~-jUFr4dIX8`eso3}Y$QGL(gabc*7rIkBjFai{i~e|kl|#%ZzY^)MN8YHM&d>Or z5GqX}99*21he5GnuDm*OZ zMA=#{I!ggBLy2)q=(*VJG>YxdUi1!7RdV%{cIhIo8Wl_kfT7vu7Q2^H=&!^rNx80g zAT)Eq&@&>Q(74N|jy+#!t23Nkj)l46z z(Fnp22~m>^bAIR*o|QQkm}U++%W9L*MuLgUC9Q}fbxkAjT=+Ox8KTOx%REb0U^WniWSqyyvf zE-4D@1x2v6*b-BME`F5u4KPT-gD|K9;ugSUgX z0m%#g0?dGMa52~cu(D!jdH3f>5&z$7>X&IKm`#SOTM zahn58Fbot=P-A&7@H-&;XgcV_9?(FMVYf?>d&e!oEtFVlo~oaOnNeY|4A0i4??uUq zGZ|TwTKTu^P!bYz7nWnsRU*j^B0sx>?p!9z=Cwcu!G~-`>e5(6HDx9zqHn%4Q~MqP zh{L)|L;zOZ;kutfTEpkc7$90TMp}g|T{f;Z^du5oechcuG$7g4LVx#K!==rzr-*O1 zx!`rKxyA`WA>HBUkF_-+9lh70qn-RVM^1I9%La^OZasqgnUY_6hPh7}OER3~>iplZ zjHZU!6o=V?!XBsBwaf7o3cZxY_G$E%=+VTg8q@0qF_7c)qCYu}T=A8>NZ=q?wtc)| z-g%rIgT0c)baDyXbJ87TxiD3Q^$Gbaq=34}Tuvg2(%%(*Pd+{JNE+&CJN3QZ#XWLP z5{Cmi6cGD77UWqKWp80lFYV_tAEqp6j{?IWdt%xY(z2eq#C&RXPm2F|QTsh8eHE#t z5{QX7DzXFBW2htqGAydR+B)KM>Lv7CY8T<-;YNE*n|5$Sz~+_Z=5N(Sf25=jB~DjV zqxi{)qO^*(s+X!ycpmB#{-ZI>z1~&vBwiKG+Da^2>o@MYU3UY_aw%+}u{)9tw66`# zHym~}m1vf}b~m~1-pnQ4jk`U46a+(x1OuTze$ZcvYGo2=QRBBu)R(NPj1bc(+F}<0 z-#kF*!Wg&EjA<>8up^U0)*Lsut&XG7k;#>L8a1>0Fgx7dB;Sa42aR?!);HuNg7tcC z#+F0aLcIpdNB|Jd+7{4#Kh4);Ms&_k%N0&z3aP{+0;Ar=>GzO=m2xTM!MefQxlB(& zu6;-&0>@X%R>w){KA{azE=Uf6d>RM2b(kNFH9M+&e<@L!L5$20?}jlB{~4o~$Qjee z%S#i}lm+%E{i27kFvzQ8dnGlMH@uo|9Za_$-XxIisp|=9UF8!KVYsMUWR1rDfiB6Y zT{uo8*R>u=yg&S>@(oFrZ=fHEOIBfpm&GKewW_Gxt{U{{pB0Mj^+wOKdR07(eGrTG z=($n?ukSKBEuF=Hzq?A87{!B{Y5~lXD?cfJ-I#qq0-mWLyo?H{Qm6X*h)icpw3OXQ{9f(swxEdL_Eb*C zFI68UkfVh7OwLIgi6hOQt6eh=dKo2JR9E|X+B(r>ow!Mo+eeD+BVuNY1bD^=tjHJ70Z&dfslQ% zQ@kq4*UL4iR6+6o=QH`_e@^`WC4pi5C4Bz(z)OH)0XzZh1UrCy0Nw$-5B~nOU;!Kk z^I$iqf;)n5z}J5rJQ-{OUxSx_FL)0Sk1w15D)@$YdGI*!Ctw;p1c=8!0f@(cG8hNH z4}K4P3cmb0@JaA^a4C?l0Qm#F75H~}^mlT{(10pFb$@_L%~CUd;-25K7Aa>XTT5O z$=?E6;9T$n`dxj$kH7bV^T7Y||5w0gz`Jjz(_K41m&N9%4!~$3Q3^(7K9*I#7(@8G zJ`+}y*;*uo1D;R)2oq*74+S)6cT`M!a-$f$Xarrwc;% zqLjl;Fp>J<`eu;Rgjo}k%BVY^4=$vE1G#}iV_9ysd&as>f~Y}G!0oArUl4x)u{OPS znh+8edq<1~nm(`1K6u4I)|&ZqCiRadkmB?30GxXzWK~E}<3$a+FP-+77ny4jy52G7 zuiPU^PCa;+%=dqGCIt3>8J5C~)>T`%&l{{_yO6p5SLf%ni4g~85VKXMVc;rEc9GhI zh)CCq65;B%KFez7dddOlvphlooLzMUy^Jo;Vyce8dVdz(6)Rlj2U%QZN+)(PnD5oA z7ql1TMhdBol)8(^{U&D^VmwdjK4;ThnVQBq%dZF1n^+1BS4Z2e&NL3gaYcua2dYp^ zeZC9PGRvwYq!^y&_Q6DREujR$PLM5xcwLB$+9ERD{e^M9^oidWRexc(If>n@T^tf0 zi%TlJ@yj)a73j0g9etBG*5_>uthgFEYSRCl4G%jj=7)4f=3ql-JCe1EIKia7ZYSt6 zo-nITuwR}#kZC(@Ahrctse)`OaNjqD5URjuGChiO)#pmI=aMY1I%_u5>#yA45{rer zqliz18!lRk7x+K|5o#j7>^sIDS98SENCj4NCD=_Y?@2^Anhob;0~%IGtaR8{P(3R8 
zfp00Iu$QrB>vNYG!;)J1;EFn+w6S1TSDGBLKG85@jlZXNT^jq~=Hz@3$Bqp|p>BK& z$98;6D7*OJXg+o}oI_Q{Gq6i+Vlf#pvDQnVm;HZMbwfUKWifD8Y$i+2-s_eo4_o_( z^D;}qAN_S%7u%AomocyOl>b_C8x5>@nD# zA(-=Jk?(#Q()jsR%0c4*)I$GP_h{YQM9C*lSS|f5{A8`gw{%raJ4&{*dtV?O4z5Y0 z0tuu}bZ#B`P>;3qIMty%bYV?v6pxGMq{nm=q?XG^2n=C@@(OrO@&ED#sE>UAJsE5QKZlopK4^n!FbGZt_W*YXpM$r5 zJ{Siz@L%xs-vyrqp8>A~SAw&_Z-W!So8aqT0bUM9@bxbR7Xiil`#gO72f+Km zJorO!9sIj&_oa{D25t_-!!LkS!71Q6cy{^qKLseh-)+I`;M0de9mqC+6}S%GT(2#2KNUWfqeSk3VfLURG&TCF?MX2^;OD@tYWAU*cYrl&oAASO2Z_OK>D z_4U3NU)TL4ZVc5>_l`|G0(IAc%fiiRB`SK_))Re}sGw&-5NBe~b2-J5h9BY*^i&%* zCDid=I7L{ZoE(UWV;(pMC8T?Ns^9Df9w$omTvI(O#)%5wC2C&^U>AcA%g%k&ExkKG zZFOK>yy)t-++`bOc zy+gy{I?#}|z~ma!nfiaN59C@d_TWjlbJj4^LVj?3oK+G;PDL5!$G!clN4a@7+MmtP&?F>u$G%bFTTv%oaFj!2)3x}Zp>Jn{g1Mo&1+5#(uw zV7|viYJ)o1y;R$^g{~&~y42`E7UW#J*m|+_n(gB0-J~at5Yc7SS8NaM>m?092*FHO zfCx8I)NM)iET!XwBCBiQ)elaL%c+wnVjDJ5Q{ga=G!nZ_NE~e|MeR*AtLI(#V0E%( z8A_q<`u{*OmEuUI41dD zR+~JZPQMkh2`MFn1&%S_M3N2%=`=Cp$=v6+b(Im*;>|EFus7=7u^c(gNw}BlmeEl> zQV?=vQlvJ~$Rq&3;J1T_&&|B%xYsWeBt$SRQHMxm_!n(3Vr`$Mb`8g;Wz%l;hlN<2 zy7FmdA}J490t?QW#I`x&HfB(SW(evA932?2oi4rDnUQEnoc%sN5;dbX*|d?Kk%t7I zNUFGC$6_~;QMluJJ*K95%cJrHS6C*p_vwzAYsIFeb%zd(ZLo^I|JcYdTVi}QF_AVj z`3y4h*qF-^h91$M^;`(@z_ne3U+l)T`=aToW{luz4|X<*Nx=x@=T`4RZASGfHJ@rM z%q7aUre)30^_Mm?f?^%h@pL*0#sO(OWh5 zC)$E$+Y8fiTMzVraf#Yy*b^t`@gb-FfV7(pJa|@dYbxZ1g<&q$dW;Lm>_n-uBGy&= z*_;orT@GN!@aerBE-$PStqsjQ0*W|Y5C!22B15=N4;L%9D}ivZi)QlcKOHhw9X0-= zTCyG{!NLi+OkuPYM65K`jwlbxo{w}G#3f$OB(zJtF5Qv}X8V|!B07_E-?o_Gfy;#j zvOq6H3ZeH4HpV;E-6I}AHLj5wlo+}1B0z~WY8%&YI0JbQM}w{LGpqPvi|2**{;*`# zN4aWD#rL3DEJ5uPCCCaq~JU@F?#FXK9F`v3Ol-{Y@C*K8Q8B5eADq5{$Y*EJ-yi&)f zI(g?5w7f}I`19kVKotK!0dFcjwD|u^1E2aG`2KH$cYtfa)!;H9z5n;&`M(HW2^0t5 zD3JaCTJTZ$|Bry9;AEhC|Gxm=2d@CT!QH{#!0&>q;rnIR|6(A$zij*0fjW@izZZb# zgPq|2fO`Vj0O+p2?)eXaGr-BKq6cg*vj{b~d9LJq3NarW~`#u8Y0f?N9>>&WCI^0&p|t< z2AhY+QwxElyl`F^3l-_qzDxCEw2xwKp;ouCO1YSf{C1h6Fmy$JzWTfDRe`oqA6_&) z+ZyL!jaq8&2~k#AZ1?#EVVFdg*xn6t_IdKy{eG(_tHd`7oK-PRiEQX zV_0`xPa?zK4XwC7REE9ZKbc|4kB!cNE9fFLj>~D89G4k?nI4Z;*May|JQhObS2Wv6 ztcqe5ec8`=shI&y-^C@zn@jkiM!H**jr0v@PBQ1SovlAx7;jrvhkiIw1-gx+>7y^kZ!vANxl`#X(?-V0CMIW`N0 z{MyCl{3>oi5}UE*kx%Q_H{r4tG&aIL7tC|f#lnqb81es}b<~l#? 
zq>5YAFma5<_4{dDvHA)LWRst^rv_r`~?Y&Q(*WmW0>jdN2BI4c`%tk|R;CW+}@!fY_J))p)RR~$3r)XcCTOuv3w-`nAo>7& zG=E`UQ+yQ(F{{MsE#o$F?4qOP%1os0cg4+Z62>d9r zfWzQoa6X7Kg1hqfSCIug1w0vS0N;TB|04K1@DlK1@Ca}U*a&nl;1%FSU>Zz;NpLUl zCgcEb1eb$Fp!)%T1l|w-|JUFd;L+eDaBFZYARmCQ2G0gh1AhXl;Jfhup8)?3o)4Y} z{uVqK#5(~mK@K4Kz_Y=Fzz>lDTm!BKuK^cJ!=dACoUPw-Ud#jP$PBBTj!eA}9C_YFmj7;#vU57Lmx+=;yI&7=!55FG%E2R9E>Ikc< zs%uJ=jC#)-++!PAqTy)z)4sQyp+Q4G3uc9$hu+_qvPak#I!I zci#3Xqyj_TmKk(jKmny$!cb-ht$x0bn) z7aDOrxG8nP66U12EpJ7sBpXr4=auW}B#hqChxsn;X3e-+^cUHjtnHXM?lB?|@qa`S*Vfcn)|v*bXiL=L6mIe=)rKi@>wMZtw&6^{c_# z!D~SaJP1h7|8scupMjr(H-ddYzWh%Bx*sqK?gV}p+y>kXyb+%LQcwe{!Ck=z;nUv) zUJDKa-R&O(7lT`XZ@{O24ZH%#7ylNZ`vHpcr#k}wL0_xC-vh1&MTTAj&E?Y8IM<6e z7s>a&6WVMU7*4qGF8+JCsyJX4{|mpf%_#&{h}@TWMW?KMZ4ATX-#>TDUSp!Dx}jb{ zx)l@mVoW4iJ6HbW)WXsGp=rzl;q=fJ#GHo90yGyP^htbiv3#iFWYiM<+M~OS0t?PZ z`sgZ8p4GRXZn6`g)JJu$fY!>YTy~whBqCe}Szn|E)%VVy-sty)R8;)@zbs~$Y`16p zNRXBI6!*#%ky~`53A}>rD#*QZUAD{nF%0or;m8zlOUyl1$kY8y9G}I_VK14q=Ao_f3Kzja*fbIj_2`Jv*caaHv0K5h~4m807!O1|r03|Q@H}E0w&tM*$ z2_!3c3+RB&;MU*+$O9e^9t9o=X2C~V+4v=^ayG%1sCnSPEY7X9!RO8k^6cd{ zpf*4|$ukU{#%hhb5>Qnpxti69jTu-wfRC%wH~C0t$kXvoGO&wlQ~19{pviFhHywGA z!2ik2WRr+&73H1}Y3LY5PcVyheXGwE3W!?0$cP*M*)0nX|L+Qd8qc{OaV66~x~(vZ zX-GwH7HZ!8WMFgJmO@32G=|PN>Q_v5?b_MFl~gpj8#;5t#?hmcMv~KxCihFlo6b(O zVp^Ly(Qt3*%=_|ApS3FVaG?nUDs%&-2KRLZ_c0y!Ya6}cD1yw9U_^T6hSMoN_?{}} zSm*-#%Kb=KZ{>5KR=Vh>3CQwWn_P{v2IFiOKjFHjJm;~c*5nJcHW1A6^|Gjw$Q@^f z{mZ`W*UR+omL#t2@&$(F%Y7bu&n^3Pb#mE~ir43y8PfZu20N{j15wBwzXU^hTcE4IS6Vi}4ObUh^&S(X!=sFLO<&e1{Ao*|C8mBZxmW zE04HC6Q-w4IaaNG^t3^m>N>8Uu3x{t5}w;Hd*15{&x<0bTMM>DPvns&V+QPo%|nIk z*eJ8&&SY;}&km+>}fpQ%R%sv3_6>c`Fj zPa@TGVd>OpCg)M;2t31NmCk#ybNSF|NhkK`!_MX2g&R-JURO=^*m|(^JZnMT+`@Z$ zcj+D^S32;t`&tKXJoU!xz;oD6)rAx1SSR|g^ZnO8|FxT6-F>T7TI*aMdbV?U>{yer zF*d$B!bK9_Kr?qlYJCtPE$yrF*YPF@^tDA>BL)H2?pxvJxSErY2i4jE#ZO2>nJgS^ zAx-6OwWP9Ud5owweI=Ya6=fAa*GmvTLj?0V@vT~GlHdl)EcExUiI*KbxQud2ayhl= zhT#z-EsOsW;yM}!n@hFvzN8)ugpYK*6KyP}Ih~Mi&s*sBDE^`MXR)m$Kuks5nXq%s z>cM%0i5$7jQM-$t^_WSx^A7U~oG{v&+3t5?X_>Y2cVRSFmT7QbAR#Mp#KCp^&0%iI z^&IYm*{{3_J+GyBCHGD@XW-dta1$z%x;oDS%WzXSjO_N~7!I^B zxLMfMFq%ED44Hif*M+m7pJ8=SIUOHy>&kkyGzLp+CJH;8>eY`nG?#`&LMPFP=3lyU zYUMd)rY}J&D*gX1CJ=Vz#{d5@zw~)0eEzlI>0koL58$oAt$=v{*Mp~lQE(R63|4_x z!Q(#wL>vDqf6E5oFTfLlcz)UU*T8CUb0D99pMc+2jK4>Lhk=WMJrto57pF0+3z6Ey3I2@Bc4&CXhdXbHO*@?Oz981)d5X1$Ki$@HKe+ ze**sqUIHEmE(Lc3cLiUE&wn+z3=D(4;9lV8@cC~7uLSb_FI#~xz~6r!yc)a;JRh75 zz6d}6T<|b(F?ax|ffIqo-~xW156%GJ4t&3S3BFn5lYO*S^(nF#)8^$xzDgA&8MWnX z2prHy!fC1S0^`E5;_md?Bb_zX&6}e;y+UbY;w**&<8w4+c|>;MM#3r>euN`!z=O|A z%C5^diXZ~nW!_b(v3c}zFWZPXXFEBK)v{fmd(fkJ?XGW`8r?fJab7$Lh7|Tko`Hv| z6XVl3CE2f7Fba+?PbWE5sH&(iSzU#k{Fg;GNdlQPRIi#HK&>^$kh#CnGxyNQ_H4rw z;ooq-lQ)<4S%5`#u7vI^kw&2zo+0;{Mu|q{|EyVd7)ED_$Yt)(@S=`%8h^+H{K6M^ zr@Cj?)+wlO4m6oOoqZz=MR5pFlyf#6OJBsU(C^apnre^E-ur7#YDqP z_*UEi5=?B*ye-GwacRWKSWzYP$);MbOfC@V)H-KzdtoB@i)E@>rAN15eM}tP(Wx#o zZR~sZ*V|*#T=?LfJFHh5=i1Dpr+3(tEB12IrtLN@DzWaxMPU1dCypbtWC>Xo$|_Em z@a~G!#nWBy=@N=5?@~)n7q(wbT{qq`_kiu``oW1A@1>MnQ)7O^1N=&OfLULn$?+vE z8|>7?Y^o6CAX}0Z=nkOOC+XHr^AB7xfN$M_D=wM7VxWE*-g|0xRzvfyTfP zu5ro14<4fZ1p%v6`A|)gy9>~R#dz`nM^A5kUTz6Q@kx#?0bF`8hPAx z6#3y<8_0Z-M$f34QM-b>R9j=o7X^7XMU8z;CB%%K8N)zUCk}Nzaq~xe9Cxp}uH!YL;d9`~i!}5~VJ6Xkh zq|d%f7Np)$8+s(y<5J2{-yVs>H|g@VtdU|*Jc9J)CCoz3r%JxC#s6=Iue?V5CHnt| z1|IWU@cHip{}=ozm<3JnKk)ef1{4e6cfohy@Ba?`88{u>9o!Av6?_=JehzfN1khdn zx53jlfqeeU{{Iibf5XS?PX9-M?)2{fcLMVN|2gnkARhnk!BfEucmP-hZVtW$U;lOR zHt;I&3@`_DAK+AQPjF{&CvZpbzwr8!0lXX>0`~_03xEG(@b^GG|0Uo=pu7Da29E~! 
z1@{5>1|NdQe;jx$xB_eew*l{j$G;rx2X_Wvfsg+i@DOlEAYXs#>znc2`}zOjU;|hW z2EfPp?j1m5sSl^d$e~WPwgb2Lv(3r<12c<6r!p-YvuoheK~L6C!aQ(xCBCHJ=eoe! zOq5JL9d5*gH0JGOfGexf(=5d<;tQxXW7noz@>}mWC$&0Y?d8I^5?;1n!Cy*WdDx2S zor%s*+!ha3_q1>m1g%AUtEBX>9;ln;VP;k?rYqG(>z!QAZ5x%%O$24uoaPp2mSO%z zu$apGUvF~`d1d&+3e(BN9~zD0-pNR#5l3(rMD~Uh;>}yPbZ2M%WMia=tx^)p(;|O` z4Y2QCiXl|Io9v3b39w_O@}3G@-?BZ`qk|F?UyD|5VRaDUU%7F+%KjpH zSWxdE|7IR6&Y0_C7OHt{gNIbObBAAxoq9VOPWCLGs#KeTs zR4y>n7-K}&?#Xl;^i9nA#+eg2yz3e;_?$w2U*L<9lE2jngb0^!5CK2KOFpy-) z#(6VsQwHw+JdaOQ=z&5DaB<%rF*71>jOS2E+>mKo4eM>F8c&8EJCg;o4kkw@HD|}E z`bj4p!gzzg#6D~T+t|w1WEFga#+U^e;TkA2RdymF6CW$XPvRJExfG>rnY&IROXHzUo zjY+?C26h=Iah}Q2txhvbQzh-*yrEZ3%*mmDkqx%i+~%kP(t2oEWyW|s zxw_O_ApemSm2<=Fp{1GT=#XwcHbSCY^@aq4+z1|-7$bC??@TIBAD#QGP+Z-sGj9sT zzUpb=Y;&Rq3p1U#7)^z7gj$m|!%Fvg#wXhc@Fi(fjU8SJ^_dFN)TW}B9YW)2^DoZe zOg)>0tM5~$HaCxsU_VMxg(98Hg7ZdE@g>s*xmv|~oF4^#cx*tjeT8H^GBYtcbk>IT zSPUTnX{@++XW$b8`*aEUn^KCsS&SHMN$BI)nEVZ#V%ji&}Hp?pKnii-s)$4Yz}f3I?vwdZVIYlcYH+h`tigI)*T zECj(}X&Ox}*QG^M{i|DcZ)Z@6)Z>YI6mf8FZl*J|ZrwQN_WXf5fsoco`LxboSjYba zK3q4lY14+0Q%0Jt(F3DrtUu*I^S&GJ+d6%0bZo4->Ac9!;M~`A!)HSH7S#Q$`UTS@rCIwv#62NYC)l8e{GzTUZ*P=e^rGCsbLMpbX6|#!> z;MW=RivEwEFyeKl(&+g!iozmxV5zZX=U7I4pmj9kc6F#ihs^$8JpUWvImQ3S_I zHwP{PCj#;O?*V@Y8sH*uBDg)c9r!l<{(Heoz>~p2umwo}e^2mEc>im`JHUg%MPMHo z0G~nza2k3L0z3^!K5$ELExiBJzyrZX@I!d}&jR@Zke&Z+ zz&GISzYg9Do&f$B$lw3>;qBiFUJMR^n}dIbufGzk1%p6-0KNlH|55N}ARYf-0mTaZ z8T|W4!0W-EgDv2{;NM>g_5#`Mt54^_FA4`R^hzAquHWT^PS|LRy8xFfTii0VJsS?W z`1sjtC0wx50_n+2hxm$DFZL&oUK{;Z4vbz7=x&CVg;$}OQCVdjmwzltgPHjRl_o2ky2c`}S` z7wvD7iDKn+++aPei`E=jS?#<`-{cl(XYd|QyS)s(Mjz-jUg(iwmORVQ+GG+5sLU2h zcLWYrA7yA~UXNz3_Ab+Ch6-ib6bhB?aeF%F0}5X?)SXhZITWw zU8D&QxP!0*nrpwMy_Atk7&$-purLh%148d>D(Yq!w_S_7e5{jxua1`Ss|S^j~mP?98lT=Tc=Hq z8=0=s^{wWVO=-+I2!mw=S|M}Ms0>A9S+o43$-Au>9dnAx+YEl$3OTU$GN;1w{!^*{Ai$OjGQVgXPL5Q74XS>&SZP!(9)AP#E%QJ*AnNd zH}zr|_^c(dPf~$~NLyo7kgKrC946LUI#09~=8AJG+rYXzyPR>>9@QqEZ*Jw4%^n*7 zrp#))mbF?dCHz^foka+*BE9R=vwK|3*HWQE$w9Q5o3ok=R4-JbP_ZRvebi|+YaJCg+{H)>!~WbRjD2RmMJrkov7=4EeZWsSvZT>chPJ$o@CU^ zszRtYdX{OC6aa>Yf(@kG&q%GCSJo@YLklOX4(FM&@y>89$}Xuxe{|J8cG&`_AdZZ@lE-{}oqUJ6)?w1wp4bFD|f<T!6Ot$BIsPxQ)ZY&e@|qGK7m%04kktUK|>ELcIrIVI&m4FS4)uvXXzl9-P-C5 z6(6eZvilFH0!bqKoFq*qAY3BN(lTG`=~@u{vg0!E>=Mc{Va)v8okPH0wQe)^3USl0rh$Bao`c{Iq~ z+}V+Xdq-v`W~j47X-HDp60~KDpfzof?R2KNHU7WBxc@Zp|4shZ=Z*0E!{GP8HSqU~ z;8gH)c=_*wYr%`aQ@}jf2)+w1{~hoJ@MiFj;3?qA;7LGy|ApYE@bo_cF96R6e+^Cr zgWxXU`|$SHflq=@fMj1sA3O|P3?viyPh`Tc(pcm_Bd+!df3M8!~%f1L?N5LOas;8b6^&_Q#t0Rg_2i3CkIEoESL|Ii8&< zL}EwPd!SVm_!UYgV^;HpEDMn#c9I4kYvN+iK11oi&~Pwjl!qCTSJF!8Z*PX}H(ApV z1ttw*3kf3e8?kYv3LI~%90=xU()Abf4a-?R}S`M9eSP;eiV+ksBIq2Z``@9ySl z+;{bu6K@U)HvOwShlxXi?u6%dGA9o6lw)EXpnvzZ2^k;c&$^o>o+GK1*b(Jf(CYfY znVzfeFC$ZPPg7ZYs6ZI@KsBi_-ObwRlGZFC^p^-?1V&U?J8PBdTkZLVt=2D4R7V$) zDTG^xK}jLbB~ZwA2wT*dMx%zhWDHSqvFGgc4;HR*I)!KH^tDG(5~Av+WL;M0SSgpn zOLc8)MQ5cGsd{Vo#pK$vM*Kil^T~7ZLNzH z)VHNKByW%9RrGz2uE`R}DJ~~lJtOqG>Zcsk4G)=>cjVcsEI%0|?4}e$zGvrj%(1BV zL$SfJy}h?n+;nDLa}IPQ)+j+HtA15Q^eOGkJ{8xNm4;CdDG}wtcH5L%#pR}xni8Bi zDecxZwQDtwT<0ey=hiYKn=$x#8v~lG_rg8o;bEw6Rt=HGYr(Yz@A@-i^rYj-U$RL( zoU>P3l=raHH^albFW$Fn&(;g9-x^zX?AW!nv31vl+lGhh@*{#%**TdR9VV{1-J%Vx zur*Xs_h_v4guBk#<5l)(s{`pmjA|Lx^jPB|6R|=H2X|nxqB+BGKTU zXsT&pN*akZ*}}jYwQ(#zyy9Cdn#2Uh3V|}tzU9D{#(XGL6W%XccbI-KD`WEj7pu8z zRPP)f$9yfmzgaIA7JF9v$n)Oh^$!KRg7erLfHBu0h=bAjshJx5-k7r|y?gyYq1$y+ zu-w<_`i|pO)lpYq5DFxP4mXZF_x)Bc+FPntSrI7yRMi1r=*i({q?jL)m>eN`)O6th z3PBHTj|<0P97wFYS_|xdF!t~YwsW>;$JhBD)W8bNxAPzz@PbFH;kHqvhKBvcaE4~} zl)OF>L(sOSA1XE*)kC(C7uuGLTj6GspiRWc*fw$`xDI!Pe6`(Z z5=Vms25wPjswQf6ekOL}T-*46+yt;4KKK%R{y%`%0LcN)0Yl(spaOmg 
zZ!g~e@n9FY5S$Bs1W*4V@UP(A;9cOkK)nA^a0r|Q{s5%BzkC251#7_|7yw^^zkdPP z0~F`)z3}qy0at^!gTr745Wg=UfOi2OgRj2|ycxU_$OqsWa4T?2@SpJbuLe&7PXx2z zUf@JpN~t+n znBn4a5#J+Il844@^{P3Ol4V^m(MgXP<&l%_xu8(b`y=#Cc@wGqvg_E2^3kV6-|JD> z^78#KGBt_LWxMzsHdza7C(_x;J3q5$3J`?l@m|kQ70#wkfymNAs-(<*^&Z^ z)OtJ`g4m=dbH(n1vznk^$Q-Fu)FMr@IDlPTVw@WDFmafuYC2kEmc;5MuZ$~_u*^iw zE|;{HGHbPJU@TrNG|x=DXzmEbYNbS_!m~cl6EG?ExJ{$8D3OEZKl(XLuHyQW!bUVi zp(UOcE05_oY9Ceh08&*bcvRuS68rz zBcGxwXqi^@NBm@p_V)DH#Q1y&%8<;}GQ(F~&UDZ4unG{HOf{QCT;jGxaT&^v!}V@H z1w*=Zl~@V=<{`Ex1bJGBZ^Np_U0L)@kYL(8t)jXel5T5gIO~YvLUzkZK!Rxwvk;PB zHxr;&neDP!%JRsv4Q}}}BD*lt3Y<*SUQT$Ci8&+&mr1m$T_(;b!eeN$JI&2L$vgsi z164rpPS+gG7EM%k(nTS|C0ElQkp}H4FxMR>-W@2!CXpMU_ujEAhK<2GoQ7 zg4(dmE)!U+kZ(L3G>;4K7gusf9ha}R!-)|tXC7>i=99K-#i%ArE)#t@RE=#=`_#0; zDhM59`;L(UOatGb^m0Dkt;Natne`c+9sZ7kN(M}+DO|)v(JQh<%{D7NnITp}b=Ner zAGV{NyjhQudi;$y92)Jm$Kyr0soMV)dZZ7;h2--0ijhp!a>e4k?;Sv zfPVxna0d7q{Qh&m7-)e@fqeh10Uw3m|5xxV@E71wKz0EeKn464JpaeQqrfAUv?`!iX)oAP=a2748vE4gvAm^W2Y-o3`^+@9Vk?eP#%km9$ z-RhE6Sm9+^%cZrdX!N>j(4&7=D7M!dJBT zEwurv9%v$FHWhfQQ$dxe;}{~w&e2#58p$|G2QsQTeu-J9v_bhyC``Q`)v*a)OysS) zo;B;0pOn6?;24n1XDY}ugA=95amHTv^%0-OL`zXm@q0C)+j1g)+fykWzf^scK$j%r zvu=NI@f%We<~En!gJA<-Wtsw=BcBRcJ`I0?|PiS5w^G_H-Dw; zx$+1BDZ2t*$KpKAvhm4d(yly8!M&GE7r*ebWV0E%w%93MUoY37I5<$AM7yv(WuUt) zy;7ymmS{nmBxatdgyJLb-C6u!+5O|#Dzt@WrxlZF#vz(pDOvZS@7A_4o9^$8({C;P zz4aeoH$*;|>9=%FcIeb&Wzk8;gU4GfHY|LJfkM1Gp@;FGifdoJX&W5$=@q%IRs8 zB+9OW)q`1wa^c_vRoeh(6wGh5AtHUjPok`j5Fy+6#aNt zBKP5-58Vm}F7Dgzqc7>}!!z|ayi1^^YPQmPD%I3o{*L%qOIHmaN)A$P&(7|CcFWsy z8KvG!Wi9^NO&zD=BKdLH>3UEkEtSw3`xTCdUx~C;%cmb5aMVFnqZbXE(Wd`j&t7~r zI%V|9%1n-59e;s%g_-i2l zf9HUc!H?nF-vRy>Tn?I`2EGE1{toa)@K<0vxB#3Fsv!CTd?vj5Zm=1g3BT{qF~cz!~6|@a1m=&jT$m3P!+L;CA2y@Co?w{{WYRb>QRh;%@-22h#UH z6x|e9-d?cvh1djxWZR0WmGV;Vl(kl=nVT)VN)Ig%jF>J2qqJ?KTV&kO; zs3I0Dl=HSg2i0PIq9MbYd}|_DJABZCo0IeIC!{tuuzecgAl53V@`sNfMvX|>%~~{YzT2Jh##En+RPL? z*2K&tE~`qdQC6{s;_f6(<>UlTf2tH4SdCbJ8k#gFM&?sE1$Z9m9?)rLWum+NB~{)*hlfcres=R>)2}yW-U6JF{62 zx0|j^_v79|`r%|5oNN7`HPrRDWa(<>rKgO|@TK_uY&hfjtFBpEcHTABH8HGXgF3;x z!zYK|Z8w3mF)-e1fhY*`U&ZAomHr#h?tme#{NSMp_YCv%#*z#tJ?FynVI9uH&^*J` z9v)iP?aj&ru8*gfRF<-cgh6&fX9HP2pYs>X$9zSVVQH5RY|qJ@&?T)5`OAsFti|m* z8-Ww`cg!a_^kW`LL;X>wzSrLAm2;9fGK>CF?DJTVKhHj&KSa`dT zv`2wqkh5(sWLqdIr5|qMT|?E^Qi(Kp=n1*Q(Bl%39M7|SLksCstAxEa9_~eD36k62 zs_Q(bBT`ysx}qAzPx?((MLP)cZoDz5f0ZwFVBAc*jO=YU4I5~5TA6I1eQjudez2pd zG*0NLmU)YI63S#=ac`rKf+kAgSg?H>Kj<&To4q8^qDHI}{duWZjL^2lF7wLf0Ze0~ zU95RGvOL0$Ob%Id0%tSMr8*fr9TuCVtz8#Lv^?F~Bd14phfBrTMTU@*%%ra8W}I%J zUV~*w79^=wuXxc*`u_z;l>Z3)fAs(NKk)pz1Mm#+XfOrt5AF&i3-~hl5|B?o$ph8` z-3PcEP+Wj(kO4eA$OFCy&;NDsZ{U^S72xGy1l%6n4qOBO|2N<{;IZH_;1uv}_ws0@8}Ry12eV)m_&I$2--9QBjo>Hn@XrO4K)(IgfEu_fxC@Xz{<%Q+`0oS04KM#K z@F{Q|co{eX&IC6D74Uj^`o9M+1($+T!H?kQUkvtuQ^057;Uj-P%is3^>eHu!kMj5B zUCN4)<|G_Vb<5}|RzvuD!LZUU>m>=3i?BK$%Xa+A$Aqc} zTazx(?8QiuQ;*DZ}`wnvONnvjjg4p{@4cb=9|S@w#)&=l=r1!rT_!O!$Z}4pR!HT zcun&WD2mNCiIK^9?xPZOb?zXU((a3pl)va;QmjrXZBKxS#rs#oDK7BZF?;EIj)*@nrz}` z+Y8g)LJ@i$38KV!+E_v{PtN1@P5r?{oeeB`R&i@8b`9p^yHx8jE(tT-!^0Xo@5oqJ zO2x#~Dr~m>1cr3(2jKw(yDbpS51qicBXrkE<_DR}o(p1eF;fjzaom2#)kDYB26DxZ zN5HHnjAGZkaGx+10l-BJqb*r#A{?TrX1eF0Rq~MzgV-AF`8p0-mVaGgU-V{lC*#a` zG}JE_7Kk&w5dI9@d-(-x*-mx$h=<3FYovw>jl{cf3UdJ3cH{aDXCNM8{n;8nvx?`m zxF{vj4og#p_`7t5JuNyU=@zhQTjSe$DwR_WdRo%Y=}Z>H=e$}`Odjg6KGcwn~81xNP1 z1!k!FMp~$tB?gdF_#u=qh56(brh{y53O6l_AG<2=RmmsVKup=N9k5}y7wbo8*I?c? 
zgZbzjtse<2|N3}0RQ%?wT*7uWHGXsBG&bg*tJY1eo$kS0^bRDq?czc0e!M#%o1NV~kbQOI zbV0~ow0F29EaqXvJ@kgl^C;yXR%ib+mAxS`A= zzgaNJNTyHCj!Kq~yTK}^3z4(1v!^_g zKFj^~ix?QekeypjP(SE~>;1_qt%uc!IIlAQC^|!fPnLT1g7yOR32nElGf?M{@eDDZ z+)QAe)9+b+J=o~PP-vLrsMVP!&T~eNh+xXMTUusWm4v0k)7&PQXs#vhcGwBBg&|HI(zK(YU%_kSArOCY`f7vcBc1^x*f1-rn7U?=zmy#Gsp{QqAF zHOu-?`EI^ehL5o1~3aA4*n1LEOXKe=AY5tYCedhA9z*@=?z1Cc}Vg>+M! z?1V{PQG*hJBi=I=XmgnOVe-4|haYBM6Kr)TVC^_FM>2l6+K}iV9U4xRF1{ua!gHeG zK#tB1he^Gg8v+p^NnjW!e4vlT~`h+*4A}Ej_Jx^ReQ$@f9quR7G zeIBnX?~RuK7B*wlx3VhqjdE{tg!j2i=B6Y`ps2!GoXL{B%QavtuY})B@Fgd>GbXJAV=;08Z+SZ725p&rW^U zEa5FZ{720juqIM=Y7L$^717T6LSC!qXSw!)fKVPuJ$X$r>1|LwmHmks>&k7|9En29 zUA?W3=USJuN!&lyP#hC^nZ547W#M+X5*6KG>xn)~RM4{^QZoVO4|^p&JvuKA7(La2 zMG18r3Ql$6@Ww>?9rM63IC0z4Z(GH4Y5!cIB z*;^f0)-by^SNDetIElsARI53SY8?k*>>F$}l9x3n7n+Oa>7Ef}nla@rZ_T#zqu?P5 z={aQ4_kwT7ptYiDf0BL$DqxF*)j_Ux6z5hS6BL z(Dm1n$U@HPXNn0|LgW=U?I-vU_KPD2%|YuyF3QJ|_Jzp0dN?6=H4~tLMz9(~TB=|P zT;9g>u4ar$;mtxMJGfjxBq9qr0P%n)+*ZxxG+BnnM~Y-q2*t8TAZCk^$zG{RzOr<< z7PW1#^RAI|Ufzv^((nyk&RdG@p?$ri00u9!X=*Z9*lKW+~$@A&-TOpf}x_IW2d^O5+VL_>hAPP0C&2Q@}Gs2YW6N5B+ zqwZ(k(mG;IMz+unDzMh!` z@NE8{1rGps1^*5I{|#^*_#}8Gcp|tA>;S95_mByE3A`D+2|N+VW?&C^5cne?`+<9d zZy+alGH8QK!3MA%tOGwoKA;!?F9hcU$qRmreBdqMD)45YJN(@w)6Yv{Qf%l0(c+Lr^?4gX_2dhyqUG3HX}CC4aQn(@S~1p{rY4<#ZIgIohGPr3fhUI$U`0VDh@#O|qkhGjHZ%!awS4#8U>?v9@Ff_(u11ukB}p}JEbec`-cGr{uKM=d z>3Xqyj@rV!aX7WEny$#Mb-hvM*~}LHSI?5ITu2`zf|!t7n`0*InLRDs*ISgXQnS=RsIT65*oh+15Dy92L{v6Ou+|y-~9K>d*Gh z36O1cW-K+9xJ6uMp$>bNtAX`QuPP+{?IYR42rx{yybG-NwjVz`H0v%i(b06zg_>dJ zFKdWq?}k=fA4@cQUsOl;nq>NS*}YlvAbHRmQ3&Cyxcr8XTK77w<_q-(SX3oy?2NT^ zIxg9j%qNHJY@!e#)cF5n?@hqtysG>EpYx#e^ z=iK{lGb4=?JLKPod7iJ9ckX-ld+yopxyyP;W$0d#dq0A!^-v}8nd!T@)Fz*dX5bXF ztx3MS1%@at2cyOT?~FE1BXBWsvvC{>n@Qe z%SuN5vk1ZvjsPSA)YqG5@YdzJE7(3iv&+2-bp6A>+Rs zybL@SoDA*_?gefSu0zg$6_8)RGr?-`b7Xtn^QRa9Zv<}uuLlRfZ-IM*TZKX- zJN0yyf%DAyDCg%S$D94&6;;(ewyqWL-dZkP!>LZaVS#q6@4C1`DnT#3nVi++uR%bj z)~xBk{eBr4?Hc9m*|6XZnFbz9I{(&Pj)8=9Uy(=T6*#zQCUr)ncC> zk>;di(@Iloe%ems`9Q}J?&9zlXRUe|0mKY_930Z!&PiT-?HCVnBPI^$iuHW>h`?sG ziOo+G;^}hM(~*-9niI;@GV5Y5R9$bG0q?d=MtW^IICO@wN!&D#Xy&_m^_I{29z>zA zAKB=ohV-rTx_bQL#;g{(2YJ&tR-FB`EZgNZRBnw^#?VsR>1rXmoxR{Zn>N~-S0tka z4_>R&-1%UVB*L5St;{6uAZ3Ktw3@~MYc>7h(Y2zATgBeWJ7uqqu&Sy)w!Y%#f_r*w za2KbN9>ax+KHK+}Gm9w&X7Rm(Qu9 zJ9UP-_S(aG%=WA1E$>Z2gK2a3k`* z?EG&5iv70;t^f}KQ4cVU?0+8E3KSpU`^f(<2BY9UKrsP+1ik@u$KNx+?}A5x3V0~^ zU*!Dj!8PCzcnr|lfHJr<_%`F9vG{V3?S1Ey$R#zH;!C(Zl6K?&xz!oW3Dcrn`-+?!o7O;;QwfCF%GccD%x#hI=Z?!wM!o( zBj)}nCNjD?rre;@j+=efgN9g4NuvC7SiTv~d#m8aFvCd<2KnMuhdB(LsuqcwwXmOC ztmH=~pER3;$m7{pLcH>lvWlOZ#ly{TH~5?)&}phj5RV!#@9*2lxf0yzMkl3tDFjcr zVx&1^(N|p?V_b8px(IjW61{C^k}w?DNhez9i`f`ZWz5^9=uxFZ@6W2&r?YIIcYJyO zJoAAwq^tq zqqazsW=gI^RFbu2nL9W$@;9M`LRWZBJ;M9hP~uIU+_V`i1ynAh{f$}qr6gLFVeMul z4RPYKly#$Us@Ikz(D)-cJl^c4v+I)EPZ|ZbuUWZl2IZ?c^i%4G? 
z$}s1dig;N=y4(U)9}yb3F_b`3b~3BSE3qer5dP0jC z#j)?MY&m^&RiVE9bd&9+g+6k-LF~X^n++0$QP}80@S>qqXB9Y&5KAun2R(-*U; zP`X-qw=2YL{$>tErq+V4HiDqkiWD#jVbaAyAU9L9Kv)qXgN%4DPgJ9ec&-zqrq;tq zkX&8t$IFg_rGSNxvc#&lyuBdb^6th0BkvuXvYLK#G%hf}eVY&Bk~tCz+KqM{(V|?^ zc#yKC!Ce7Dd5js@o5fQmjL&VGsm$;JS;jZxxxK6gZ^`*2`TvD%XQv>0O8y__mp;El z=Kn0X7Q7fd0bBq!fct=-Blmv+yaT)){5hzDhXeTn`~|oStOVaj=D!xqf=O@yNGGrf zd=I(*lR)Kl~v@ZCXvTnEU%t;;@H%jT{5;Rq)SgW#&7l=z!ZZBHNtIk z*lNm^)szoy>p!$6+U&R<-DAT-jJb zpKTvlgZWJ*6gVLD$pI=)#%6A#7lV?{x z&OF`qM8=GIL*uX8EV`K=Oj!=LW~tB^vz$Jgm3Yf4GqWIst~ zWKt&2O4*e9TlLB$)W02R!d%WI3r}(>A^XLIk^bkK((jU0^^%HrSz^g|nFXnL)P^p} z^|+8S)VE9G@D1rp$})a>E@2jOI#tqz9pBU9&3(_=@7&DNx=o%$`;6Ys&wTBs>GIR} zNB*QRNKS0y*mhY!f!N0sbXGOrDQUgqwo|R7!5&2oR8f|!4*S{{7I(9f2}7W-yJeGx zk1S;45j7PIGn)-lP=`^o`n{5})+f!L7VI-GK=RBi?o}yy%&o(tPKMJV4INV^XtBjX z^_rI1aW~J;mM?1S+zbEg(uDT^t}5@Pft&190pIAKt)dE86iRwKSM$y%DLadrjA?B>5{JQ`opBoP z^jWLiMwe?sMTM@T)DU_ucXdcm;K!R#%G>PHl~1MkinlPIW5N=t3+yXRh_KMp zIZ!P`HPr-Uv1Q|+T7$LSb=%gIJL|yjh1AQEiK0B51hri9We@17cQ+BinuFJU6lh7z zAJX1Sf*dSe%@|xY8^BfX7aHvPl}w#wkjn7`v4O55Qx>^q_me@0u8`)iI|;b3{OjIj z>ARuh`gqZMDxSZ{bRLUqsn?fdUxL&EzcPZWvSwv8jfTaciK378mO5$N=DBul$)1KE z_S}q~g)fpFDK$~%Uex=&jh@ODtLKKbX7yZ~iMXO2E6ws~SUttH1}5 z?GJ#5gRdaVe;IrN$maiIupjIN>p=;81DRgu{67TtfzyEQ_nQNc0e1)gfeimL@O$9N z;7QQ#qdCRZ zfZMq7Iw%T!^9ZVp-a$4~eRd{%+=_C`jH-=EduWt34su{aCo(FRmDiN3^63e#Jt^Cp6c-Cpj-~iC{$8Q= z*C?N}d9`h%a-kT(nL=mMx%&aKGKDgQ6jP&wA+`=5pP5Ho^1IGe|MI3?+g$wdL|uy5 zKQ}ki9$d3#oM2+}!!<%lt=YYE^VAxDVGaLt<=~o;4I9>toHWvCjt-BWzV@Wy#%b$M zYo0nbIyTnWFudWE(UVWz&{(&jd1|hJrg?ajH_bs_hv(<~9?#UjDK<0dZ92hAtu4`{ zs=1S(dYV~PF0b%g2+9-V(}brR7|tyXDVzL6H+8G$YgevRL` zc7zeH=>*1kGrI5u3wzJ84YHuqS<`E^rfd5rX1rvrqZ#LFS|j1c4pl8Tkt@@7HOEM^ zt;ws%UO_P(W-CQ_#5%HARo=KUMy$(aGA0gqFvul`vl0pY9K{Y5txsHM-tjtLB1$J! zwk-y)h{@3&z?!a@ZKpbL5t`LB+t zt$Kxd&^ZZuXYMnnYwlsuv}Ws`h!BGNoc3SVXYFlnm8>)Q5%!I+d532B!@~V<$t!xb zFbt!!2w!&p4*|+KN5oaN+|EVt^LANllxf7K!T^ZpB<4E_T- z|D)jP;5={^crf@h^8QD`)4|igQ-R|8oedrYjsvpuUk6?U{un$G3a-($AkBx2atY1cL5#`)bC#b9|QjYbT5E?w4~x^ zMd4}-i7#!uYU~=2m`J~uOsGT9u+w6*kYn*b#ltx2CG#SlG2|9*k_5PSn`=2!iqON; z&GI>@Ck_-}*yeKNY*Q1BecQ|sat>#Qu!S8IwRZYDYG+INx`zfiCm8&$+%G{KsZHN8 zJHZ*PRx*YbE3y|U&q8P7bsLqXLBFVaiN>JTmM1M7MeM_lTyj-$!6)oJs)+0?TzbbI z@zBa_76j9FLrF3xz=>UfwSHaf6Lru0F@za3_8pZ(Y?HPE>zYZfm{H*vic z`mkLNsY5N(M4{y9Io(k-`T|ALU2V}u2M5bDi(K9^?b;jbIK07a_>y~&X|9C;R|pfj z8wcx6Ps$abhTb%q??fycY9dU9w9?&~2@mUYbG@mL_KUNJKh$0u88qea7 zj~w7;!i+wSZ{Z5z$e92LT&qeU>UQpF-jCk(bUuNWy6m;2S(7PhQ0rMeu-LBgSnI^I zVmPRl?)01GYenz~_WDrq!Sc?aZqt=a@~g?DN%Vz1(%4XT?I7#vDkGJ#|8CB`M~ z(-RxBFk2`G8v?<8N$|NBBLj1bs8p&Q;`ue{p}i4G5NKd zbh^#HW-MHIzv|RI(lnKeqUp9*fLi zLzR*zrLZd>DxB1eS7=gc?FM$`X?gP_9s!ZV@x@(kW4?r0HF}}O3Rs1JN}_a8I-Kc~`D(C!YOmV-h`-mw*vNbMUfk`%AGN`hBz7 zcR(Fz^ww(OJBn0+c2K`d54A=*`lwqTHsakgF*>JAcH3ANOAktA`rh)elU7bc3bm0N z0=aDQi*SA7(9Z-NNjf!sBP`mqTg{1~T=B|y-_o>W*vYW1PR-&*Yt8dT(Y3RaezN|R zPEQB&6g*y%fp=5iy@Xemor3ail0GqP;h|D#PlffoWKVUOg&D_ryN6sSeRK})>h<$& z9pUm(8L-QhK#0QnFotAc4sY8y&~zfou-C%Pqf>F)c1m{$*P6<-qXRnp{1(6 zzPvR|_?ofuVr!ns#nrA8Guozg6`d}Kwzpc;dU$@^R-O>SC^smFlk3%+js@6#mEYa2n>$0eq?(|zynZMpc6EpDA^ z&L|6?0XF1Zprb1;>@Y>32$3FWRMaopPD0)KYNVGPhzgmoQHFlke4NvLJqsocM6g)O<%o0W)ML~4A<@lVd?Xy?_VI#%ijMC zZ~$xuKSHK|HMkVq8N3&nehNGc+=v|iL{I}?M_zvgxEx#x)`Am&{Pn*KWY@nG>;o5q zmEa-Zr^xC*0Y3)v*?%Y455~dW!QH@dKr;M`!Bas4oCmgobHN?KcaY)#5xgEe4LlVn zZolmQtHIsDe<0I;Fv$1n=fIUZl}aXp$kq9T5Z~l0Qepf0PM9*#W3`1@mR2=T&T6E4 z!57e$W{Z8OI>wFb?%P~hsH~EZHY;i7ygtGZH3c zY^?Z6*joGGI9x}3f1_TORBRcuG|NftqojMQ*FO|oznKkS4?Zy2tc}i3%~T<|V@{2_ z_xfDOS<>ElHDK{5j5fJC$-!0r#Yj%8%Zvk~SZy({H67Yh# z2v&Exu)T$<5BZDTHtdY@fwncQc3yAwkWIc^+cFa`_dK>anR8J-++3?``GwiLC+bgp 
zd1+~PSHtt39L6WB(@1E{U)oSkdSQ1$L#FB#HeM?T?P9Ig=h;4VmR;{zqd+j7qC^Cwlw{Vt;jn`f~Q`y!3J z(P%`i^X^h@jwNoS_<1STaGZ^AL(h~BIi;^>Wi%JBRc(_Vw7N3qz>P#(jRPz>fD>sN z2C{oaen%`|>X$UBh4Gxo64^?aH(^|-QiSoXH>SEjj0)Y-$BG_3&N#N;IlGnk6OZOZ z%J^DSYLzRIP(Pc}__3Tw=C7nuvQ;)FX4CKmm5<0@@6FVcaqar#hGymGm*nk2@_lN|D6z)z0jMI|Mz<$a}RX>kHHP#17H%A z!C#>99|npAa3pvR`u@w{LtqIw1{@9MgP%dyPk}lZ0kY-451PIV#h@JhHxm z97erbAMv<7S6=K=;=$W#MwjCN;-T2muJSfPJnPdR3`d@FfuG^OK)gpdNQsas2g8v^ zcCO8BV;jT&YB+M!S2g$U=6Kq%10cF_?o}~~4)II9i)l}ZH`qd_U72vxM?vgOxx236 zkgPhP3^`d77eP)O;38V^1k=ln_D;VG%h#xAG;QK1xuc68$4N6O6VJ5wM?wPp9lFIJ zWP)8C?zFeprcGIRO0Jx?;h#Hk5Gapt&lBzs1xZ8C{i+9Op1T-?`& zOpVA@#^sPpxFOT3=D94T{2BtLOqNIcOeHH{5SdZcQJE53Fl(=Jw4gk>iJ~u9um$HK zo9a1{-KG9aL;-pcB*A> z@#(VDNMeGaqx5=0g7gc*bzgUPp?_t26wy7SGX0~(7agKbKHy8LDm^Bnb=jofrOG7l zy8U4mCvh&5nQNWQESV}%@1Cx#l9*`Q-H5x~q{p-LU|ea}k*Qf`Zg%uX?w`=QC)!MP z4Ax^Wp}jr-!F9af68BR4eq1cnwu0NN*jCRE#JRc2=6)7W(ECql8?X)8Il6Io+Km}0 zPPB%#4Mz>hhFpAIl7<5zV5(0$<4I(%l(@}N1qvdi0lhPd#}f`St_JcSX=QzR`p}Y0 zbCh!D7Dc{+6OYz|T0@LMEP{tdh6x=PjwxyzO?(qR$ETZU0dy^~{{{OhCkLdI9z(;4 zs#SZQO{29zd`W6;wnf?*Kud|sub<*SrM>JB8c&OVdIZPf**IK%i#)ZdX=DU_2t^f& zbSe(c8?lNn8Bg)mtdrH~DSCLg;N?8%T8&gnXLNOVY@1;|T z)Z-Fmykhgz)I`11(Xok8Skr^;1VZYN@TtQ+(7|s4A9f5aUfeZw@=&>28LXVvdGcWS z)b3NOONJ}M!{x<;i%+R6TC%v@wYa(@^#U(8T#h(!gcvU_ifLNbpsL2LO(VFsw$D-} zFN=W#*etG^s1coJaCG}Z#79};$587o VzGdZ@MSP%nG~x8>5QUa<{|3x2(nA0M literal 0 HcmV?d00001 diff --git a/asdds.py b/asdds.py index c77effa..b565b29 100644 --- a/asdds.py +++ b/asdds.py @@ -1168,6 +1168,857773 @@ def snapshot(self): return snapshot +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
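+
+    A minimal usage sketch (assumes a CUDA device; the tensor shape is
+    illustrative only)::
+
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> peak_bytes = torch.cuda.max_memory_reserved()  # peak bytes held by the caching allocator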
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
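+        # amdsmi is initialized at this point; the code below maps the torch
+        # device index to an amdsmi processor handle and lists its processes.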
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
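+
+    A minimal usage sketch (assumes a CUDA device; the allocation is
+    illustrative only)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.ones(1024, device="cuda")  # routed to ``pool``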
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
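+
+    A minimal usage sketch (assumes CUDA tensors were allocated earlier and the
+    last references to them have been dropped)::
+
+        >>> del x  # drop the last reference to a cached CUDA tensor
+        >>> torch.cuda.empty_cache()  # release the now-unoccupied cached blocks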
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
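+
+    Example (a minimal usage sketch; output is only meaningful on a CUDA-enabled build)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> print(torch.cuda.memory_summary(abbreviated=True))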
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
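+
+    Example (a minimal usage sketch; the reported values depend on the local GPU)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> free_bytes <= total_bytes
+        True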
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
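+
+
+# Illustrative sketch (not part of the original module): one common way to
+# combine the private recording and snapshot helpers in this file. The helper
+# name is hypothetical and the flow assumes a CUDA-capable build.
+def _example_record_and_dump(out_file: str = "snapshot.pickle") -> None:
+    # Start recording alloc/free events together with stack traces.
+    _record_memory_history(enabled="all", context="all", stacks="all")
+    # ... run the workload to be profiled here ...
+    # Persist the snapshot for the pytorch.org/memory_viz viewer.
+    _dump_snapshot(out_file)
+    # Stop recording to avoid further overhead.
+    _record_memory_history(enabled=None)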
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish
+                                                    # using this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot,
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
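+
+
+# Illustrative sketch (not part of the original module): render a previously
+# pickled snapshot to SVG with the helpers above. The helper name and file
+# names are hypothetical placeholders.
+def _example_visualize_snapshot(pickle_path: str = "snapshot.pickle") -> None:
+    with open(pickle_path, "rb") as f:
+        snap = pickle.load(f)
+    # Flame-graph-style views of segment usage and per-allocation memory.
+    _save_segment_usage("segments.svg", snapshot=snap)
+    _save_memory_usage("memory.svg", snapshot=snap)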
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
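+
+
+# Illustrative sketch (not part of the original module): load a custom
+# allocator from a shared library and make it the active backend. The helper
+# name, the .so path, and the symbol names are hypothetical placeholders.
+def _example_use_pluggable_allocator() -> None:
+    custom = CUDAPluggableAllocator(
+        "libcustom_alloc.so",  # hypothetical .so built by the user
+        "my_alloc",  # void* my_alloc(ssize_t size, int device, cudaStream_t stream);
+        "my_free",  # void my_free(void* ptr, size_t size, cudaStream_t stream);
+    )
+    # Must run before the default allocator has been initialized/used.
+    change_current_allocator(custom)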
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        ctx = MemPoolContext(self)
+        try:
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
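+
+
+# Illustrative sketch (not part of the original module): route allocations made
+# inside a region of code into a private memory pool. The helper name is
+# hypothetical and the flow assumes a CUDA-capable device.
+def _example_use_mem_pool() -> None:
+    pool = MemPool()  # uses the current CUDACachingAllocator configuration
+    with use_mem_pool(pool):
+        # Tensors created here draw their memory from `pool`.
+        scratch = torch.empty(1024, device="cuda")
+    # Outside the context, allocations go back to the default pool.
+    del scratch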
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
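+
+
+# Illustrative usage sketch (not part of the upstream torch.cuda.memory module):
+# a minimal example of the snapshot workflow documented in _record_memory_history(),
+# _snapshot(), and _dump_snapshot() above. The workload, tensor sizes, and output
+# filename are placeholders; requires a CUDA-capable build.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        # Start recording alloc/free events and their stack traces.
+        _record_memory_history(enabled="all", context="all", stacks="all")
+        # Hypothetical workload that exercises the caching allocator.
+        tensors = [torch.randn(1024, 1024, device="cuda") for _ in range(4)]
+        del tensors
+        torch.cuda.empty_cache()
+        # Persist the snapshot; open it in the viewer at pytorch.org/memory_viz.
+        _dump_snapshot("example_snapshot.pickle")
+        # Stop recording.
+        _record_memory_history(enabled=None)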
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
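+
+    A minimal sketch of that per-iteration measurement (``train_step`` is a
+    placeholder for the caller's own workload, not part of this module):
+
+    .. code-block:: python
+
+        torch.cuda.reset_peak_memory_stats()
+        train_step()  # any code that allocates CUDA memory
+        peak_reserved = torch.cuda.max_memory_reserved()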
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
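+
+    A hedged sketch of the intended flow (``alloc.so``, ``my_malloc`` and
+    ``my_free`` are placeholder names for a user-provided allocator library,
+    not shipped with this module):
+
+    .. code-block:: python
+
+        new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+            "alloc.so", "my_malloc", "my_free"
+        )
+        torch.cuda.memory.change_current_allocator(new_alloc)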
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
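+
+    An illustrative sketch (assumes a CUDA device is available; ``MemPool()``
+    here uses the default allocator configuration):
+
+    .. code-block:: python
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # tensors created inside the context draw their memory from ``pool``
+            x = torch.empty(1024, device="cuda")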
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
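+
+
+# Illustrative sketch (not part of the module above): record allocation history,
+# run a small workload, and dump a snapshot that the viewer at
+# pytorch.org/memory_viz can open. Guarded so importing this file has no side
+# effects; it assumes a CUDA device is available.
+if __name__ == "__main__" and torch.cuda.is_available():
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    _tmp = torch.empty(1024, 1024, device="cuda")  # example allocation to trace
+    del _tmp
+    _dump_snapshot("dump_snapshot.pickle")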
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc'  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
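+# Illustrative sketch (not part of the original file): reading a couple of
+# fields out of the snapshot dictionary described above and rendering it with
+# the helpers defined in this module. Variable names are hypothetical.
+#
+#   snap = _snapshot()
+#   reserved = sum(seg["total_size"] for seg in snap["segments"])
+#   _save_memory_usage("memory.svg", snap)
+
+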
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
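+# Illustrative sketch (not part of the original file): loading a pluggable
+# allocator and making it the active one. The shared-object path and symbol
+# names below are hypothetical placeholders.
+#
+#   new_alloc = CUDAPluggableAllocator("alloc.so", "my_malloc", "my_free")
+#   change_current_allocator(new_alloc)
+
+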
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
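+
+
+# Illustrative sketch (not part of the original file): routing allocations made
+# inside a region of code to a dedicated pool using the classes defined above.
+#
+#   pool = MemPool()
+#   with use_mem_pool(pool):
+#       buf = torch.empty(1024, device="cuda")  # allocation is routed to `pool`
+#   pool.use_count()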
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
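+
+    Example (illustrative only; the reported numbers depend on the workload
+    and device)::
+
+        import torch
+
+        x = torch.randn(1024, 1024, device="cuda")  # arbitrary workload
+        print(torch.cuda.memory_summary(abbreviated=True))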
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
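+
+    Example (a minimal sketch; the values reflect the whole device, including
+    memory used by other processes)::
+
+        import torch
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"{free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB total")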
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and adds it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
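+
+    Example (a minimal sketch using the default allocator; see
+    :func:`~torch.cuda.use_mem_pool` for routing allocations)::
+
+        import torch
+
+        pool = torch.cuda.MemPool()  # pool backed by the current allocator
+        with torch.cuda.use_mem_pool(pool):
+            x = torch.empty(1024, device="cuda")  # this allocation is routed to ``pool``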
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
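+
+    Example (an illustrative sketch; the 1 KiB size is arbitrary and the
+    allocation is made on the current device)::
+
+        import torch
+
+        ptr = torch.cuda.caching_allocator_alloc(1024)  # raw allocation, returns an address
+        torch.cuda.caching_allocator_delete(ptr)  # must be released explicitly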
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
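+
+    Example (a minimal sketch of per-iteration peak tracking; the matmul is a
+    stand-in for a real workload)::
+
+        import torch
+
+        x = torch.randn(1024, 1024, device="cuda")
+        torch.cuda.reset_peak_memory_stats()
+        y = x @ x  # workload being measured
+        print(torch.cuda.max_memory_reserved())  # peak bytes reserved since the reset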
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
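+
+
+# A minimal, hedged usage sketch of the pool and statistics helpers defined above.
+# The helper names below (`_example_pool_usage`, `_example_snapshot_dump`) are
+# illustrative additions rather than part of the module's API, and they assume a
+# CUDA-capable build with at least one visible device.
+def _example_pool_usage() -> str:
+    """Route one allocation through a private MemPool and return an allocator summary."""
+    pool = MemPool()  # backed by the current CUDACachingAllocator configuration
+    with use_mem_pool(pool):
+        scratch = torch.empty(1024, device="cuda")  # this allocation routes to `pool`
+    del scratch
+    empty_cache()  # release cached, unused blocks back to the driver
+    return memory_summary(abbreviated=True)  # compact, human-readable statistics
+
+
+def _example_snapshot_dump(filename: str = "example_snapshot.pickle") -> None:
+    """Record alloc/free history for one tensor, then dump a snapshot for memory_viz."""
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    scratch = torch.ones(256, device="cuda")  # traced alloc/free events include this tensor
+    del scratch
+    _dump_snapshot(filename)  # open at pytorch.org/memory_viz to inspect the trace
+    _record_memory_history(enabled=None)  # stop recording history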
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from .
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
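+
+    Example (an illustrative sketch, assuming at least one visible CUDA device
+    and that some tensor has already been allocated so the table is non-empty)::
+
+        >>> import torch
+        >>> _ = torch.empty(1024, device="cuda")   # stand-in allocation
+        >>> report = torch.cuda.memory_summary(device=0, abbreviated=True)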
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
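+
+    Example (an illustrative sketch, assuming a visible CUDA device; the
+    GiB conversion below is only for readability)::
+
+        >>> import torch
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> free_gib = free_bytes / 2**30
+        >>> total_gib = total_bytes / 2**30
+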
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
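+
+    Example (an illustrative sketch, assuming CUDA is available; allocations
+    made while the pool is active via :func:`~torch.cuda.use_mem_pool` are
+    routed to it)::
+
+        >>> import torch
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")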
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
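+
+    Example (an illustrative sketch, assuming an initialized CUDA context; the
+    raw pointer must later be released with
+    :func:`~torch.cuda.caching_allocator_delete`)::
+
+        >>> import torch
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)
+        >>> torch.cuda.caching_allocator_delete(ptr)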
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
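+
+    Example (an illustrative sketch of the per-iteration pattern mentioned
+    above, assuming a CUDA device; the tensor allocation stands in for one
+    training iteration)::
+
+        >>> import torch
+        >>> peaks = []
+        >>> for _ in range(3):
+        ...     torch.cuda.reset_peak_memory_stats()
+        ...     _ = torch.randn(1024, 1024, device="cuda")   # stand-in workload
+        ...     peaks.append(torch.cuda.max_memory_reserved())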
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes the amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache() was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
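The file added by this patch appears to mirror the torch.cuda.memory module, ending with the MemPool / use_mem_pool API above. As an illustration only (not part of the patch), here is a minimal usage sketch, assuming a CUDA-capable PyTorch build that exposes these names under torch.cuda.memory; the pool and tensor names are purely illustrative:

import torch
from torch.cuda.memory import MemPool, use_mem_pool

# Create a pool backed by the default CUDACachingAllocator configuration.
pool = MemPool()

# Allocations made inside the context are routed to `pool` on the current device.
with use_mem_pool(pool):
    x = torch.empty(1024, device="cuda")

print(pool.id)           # (int, int) identifier of the pool
print(pool.use_count())  # reference count held on the pool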
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocated block received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception; 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot,
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
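+
+    A minimal sketch of pairing a pool with :func:`~torch.cuda.use_mem_pool`
+    (illustrative only; assumes a CUDA build where the default allocator backs
+    the pool)::
+
+        >>> pool = torch.cuda.MemPool()  # backed by the default allocator
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     buf = torch.empty(1024, device="cuda")  # allocation routed to `pool`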
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
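+
+    A minimal sketch of the alloc/free pairing (illustrative only; assumes an
+    initialized CUDA context on the current device)::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)  # raw 1 KiB device allocation
+        >>> # ... hand `ptr` to another framework that expects a device pointer ...
+        >>> torch.cuda.caching_allocator_delete(ptr)  # return it to the caching allocator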
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
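+
+    A minimal sketch of measuring the peak for a single iteration (illustrative
+    only; ``train_step`` is a hypothetical placeholder for the caller's work)::
+
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> train_step()  # hypothetical workload
+        >>> peak_reserved_bytes = torch.cuda.max_memory_reserved()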
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
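+
+    A minimal sketch with an explicit device (illustrative only; assumes device
+    0 is visible)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool, device=0):
+        ...     w = torch.ones(16, device="cuda:0")  # served from `pool`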
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
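+
+    A minimal sketch of observing its effect (illustrative only; the reported
+    numbers depend on the allocator state)::
+
+        >>> buf = torch.empty(1024, 1024, device="cuda")
+        >>> del buf
+        >>> torch.cuda.empty_cache()      # cached, unused blocks are returned to the driver
+        >>> torch.cuda.memory_reserved()  # typically smaller than before the call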
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
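+
+    Example (illustrative; the numbers in the table depend entirely on the
+    workload that has run so far)::
+
+        >>> print(torch.cuda.memory_summary())                  # full per-pool table
+        >>> print(torch.cuda.memory_summary(abbreviated=True))  # condensed table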
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
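+
+    Example (an illustrative sketch; the reported values are in bytes and
+    depend on the device and on other processes using it)::
+
+        >>> free, total = torch.cuda.mem_get_info()
+        >>> print(f"{free / 1024**3:.1f} GiB free of {total / 1024**3:.1f} GiB")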
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc'  # memory allocated
+            'free_requested',  # the allocator received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                              # and adds it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to CUDA, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
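+
+    Example (a minimal sketch; assumes a CUDA device is available and uses the
+    :func:`~torch.cuda.use_mem_pool` context manager defined below)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # allocation is served from ``pool``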
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
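+
+    Example (an illustrative sketch; the returned value is a raw device pointer
+    and must be released with :func:`~torch.cuda.caching_allocator_delete`)::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)  # 1 KiB on the current device/stream
+        >>> torch.cuda.caching_allocator_delete(ptr)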
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
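+
+    Example (a sketch of the per-iteration pattern described above; ``loader``
+    and ``train_step`` are hypothetical placeholders for an actual training loop)::
+
+        >>> for batch in loader:                      # hypothetical data loader
+        ...     torch.cuda.reset_peak_memory_stats()  # start a fresh peak window
+        ...     train_step(batch)                     # hypothetical training step
+        ...     peak = torch.cuda.max_memory_reserved()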
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
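The history-recording, summary, and snapshot helpers added above are the pieces most users call directly. A minimal usage sketch follows; it is not part of the patch, and it assumes the module is importable as torch.cuda.memory (as in upstream PyTorch) with a CUDA device available. The tensor sizes and the file name "example_snapshot.pickle" are illustrative only.

import torch

if torch.cuda.is_available():
    # Keep stack traces for every alloc/free so snapshots can attribute memory.
    torch.cuda.memory._record_memory_history(enabled="all", context="all", stacks="all")

    x = torch.randn(1024, 1024, device="cuda")  # illustrative workload
    y = x @ x
    del x, y

    # Human-readable allocator statistics for the current device.
    print(torch.cuda.memory_summary(abbreviated=True))

    # Pickle the snapshot for the interactive viewer at pytorch.org/memory_viz.
    torch.cuda.memory._dump_snapshot("example_snapshot.pickle")

    # Stop recording history.
    torch.cuda.memory._record_memory_history(enabled=None)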
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
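+
+    Example (a minimal sketch, assuming at least one visible CUDA device)::
+
+        >>> x = torch.randn(1024, 1024, device="cuda")
+        >>> # abbreviated, per-device report of the statistics described above
+        >>> print(torch.cuda.memory_summary(device=0, abbreviated=True))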
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
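+
+    Example (a minimal sketch, assuming a CUDA build of PyTorch and device 0)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info(0)
+        >>> free_gib = free_bytes / (1024**3)  # free device memory, in GiB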
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and adds it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the ``.so`` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
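+
+    Example (a minimal sketch; it assumes this module is importable as
+    ``torch.cuda.memory`` and uses :func:`use_mem_pool`, defined below, to route
+    allocations into the pool)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.memory.use_mem_pool(pool):
+        ...     x = torch.randn(1024, device="cuda")  # served from ``pool``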
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
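+
+
+# A hedged usage sketch, not part of the allocator API: it assumes a CUDA build
+# of PyTorch with at least one visible device, and simply exercises the helpers
+# defined above (history recording, statistics queries, and snapshot dumping).
+if __name__ == "__main__" and torch.cuda.is_available():
+    _record_memory_history("all")  # record stack traces for alloc/free events
+    scratch = [torch.empty(1 << 20, device="cuda") for _ in range(4)]
+    print(memory_allocated(), memory_reserved())  # bytes in use vs. bytes cached
+    del scratch
+    empty_cache()  # return unused cached segments to the driver
+    _dump_snapshot("dump_snapshot.pickle")  # open at pytorch.org/memory_viz
+    _record_memory_history(None)  # stop recording allocation history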
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
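+# --- Editor's illustrative sketch (not part of the upstream module) ---
+# The helper below shows how the pieces defined above are typically combined:
+# record allocator history, dump a snapshot for the pytorch.org/memory_viz viewer,
+# and route allocations into a dedicated MemPool via ``use_mem_pool``. The name
+# ``_example_memory_debugging`` is hypothetical; the sketch assumes this module is
+# importable as ``torch.cuda.memory`` and that a CUDA device is available.
+def _example_memory_debugging() -> None:
+    # 1) Record a history of alloc/free events with Python stack traces.
+    _record_memory_history(enabled="all", stacks="python")
+
+    # Run some CUDA work whose allocations we want to inspect.
+    x = torch.empty(1024, 1024, device="cuda")
+    del x
+
+    # 2) Dump a snapshot that can be dragged into pytorch.org/memory_viz.
+    _dump_snapshot("example_snapshot.pickle")
+
+    # Passing ``enabled=None`` stops recording and avoids further bookkeeping overhead.
+    _record_memory_history(enabled=None)
+
+    # 3) Route allocations into a dedicated pool of the caching allocator.
+    pool = MemPool()  # backed by the default allocator configuration
+    with use_mem_pool(pool):
+        y = torch.empty(512, 512, device="cuda")  # this allocation is served from ``pool``
+    del y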
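+# --- Editor's illustrative sketch (not part of the upstream module): swapping in a pluggable allocator ---
+# ``CUDAPluggableAllocator`` and ``change_current_allocator`` defined above must be
+# used before any CUDA memory has been allocated. The shared-object path and the
+# exported symbol names below are hypothetical placeholders; the .so must export
+# functions with the C signatures documented in ``CUDAPluggableAllocator.__init__``.
+def _example_swap_allocator() -> None:
+    new_alloc = CUDAPluggableAllocator(
+        "alloc.so",   # hypothetical path to a compiled allocator library
+        "my_alloc",   # void* my_alloc(ssize_t size, int device, cudaStream_t stream);
+        "my_free",    # void my_free(void* ptr, size_t size, cudaStream_t stream);
+    )
+    # Fails if the current allocator has already been initialized, so call this
+    # before the first CUDA allocation in the process.
+    change_current_allocator(new_alloc)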
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from .
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
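+
+    Example (an illustrative sketch; ``train_step()`` stands in for any user
+    workload that allocates CUDA memory)::
+
+        torch.cuda.reset_peak_memory_stats()
+        train_step()
+        peak = torch.cuda.max_memory_reserved()
+        print(f"peak reserved this iteration: {peak / 1024**2:.1f} MiB")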
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
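+
+    Example (a minimal sketch; assumes a CUDA device is available and that
+    ``MemPool`` and ``use_mem_pool`` are exposed as ``torch.cuda.MemPool`` and
+    ``torch.cuda.use_mem_pool``)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # allocations made in this block are routed to `pool`
+            x = torch.ones(1024, device="cuda")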
+
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
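+
+
+# Illustrative sketch (not part of the module's public API): one way the
+# statistics, summary and snapshot helpers defined above can be combined when
+# chasing a memory issue. It assumes a CUDA device is available; the tensor
+# workload is a placeholder for real user code.
+def _example_memory_debugging() -> None:
+    _record_memory_history(enabled="all")  # record alloc/free events with stack traces
+    x = torch.empty(1024, 1024, device="cuda")  # placeholder workload
+    print(memory_allocated(), memory_reserved())  # bytes in use vs. bytes cached
+    del x
+    empty_cache()  # return unused cached blocks to CUDA
+    print(memory_summary())  # human-readable allocator report
+    _dump_snapshot("dump_snapshot.pickle")  # open the file at pytorch.org/memory_viz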
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the ``.so`` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
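+
+    Illustrative sketch (assumes these helpers are re-exported as
+    ``torch.cuda.MemPool`` and ``torch.cuda.use_mem_pool``)::
+
+        >>> # xdoctest: +SKIP("requires a CUDA-capable device")
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # allocation routed to ``pool``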
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
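+
+
+# Illustrative sketch only (not part of the original API surface): shows how the
+# private history/snapshot helpers defined above are commonly combined when
+# debugging allocator behaviour. The function name is hypothetical, it is never
+# called from this module, and it requires a CUDA device at call time.
+def _example_capture_memory_snapshot(filename: str = "dump_snapshot.pickle") -> None:
+    """Record allocator history around a workload and dump a snapshot to disk."""
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    try:
+        # Stand-in workload; replace with the code being investigated.
+        torch.empty(1024, device="cuda")
+    finally:
+        _dump_snapshot(filename)  # view with the tool at pytorch.org/memory_viz
+        _record_memory_history(enabled=None)  # stop recording
+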
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream; this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream. If it is ``None`` then
+            the default stream for the selected device is used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the memory that the caching allocator may allocate
+    on a CUDA device. The allowed amount equals the total visible memory multiplied
+    by the fraction. Trying to allocate more than the allowed amount in a process
+    raises an out-of-memory error in the allocator.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and becomes visible in
+    `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
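+
+    Illustrative sketch (the effect is only observable when cached, unused
+    blocks exist)::
+
+        >>> # xdoctest: +SKIP("requires a CUDA-capable device")
+        >>> x = torch.empty(256, 1024, 1024, device="cuda")  # reserve ~1 GiB
+        >>> del x                      # memory returns to the caching allocator
+        >>> torch.cuda.empty_cache()   # release cached blocks back to the driver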
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
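+
+    Example (a minimal sketch; the exact table contents depend on the current
+    allocator state)::
+
+        import torch
+
+        if torch.cuda.is_available():
+            print(torch.cuda.memory_summary(abbreviated=True))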
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
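+
+    Example (a small sketch; the reported numbers are entirely device- and
+    workload-dependent)::
+
+        import torch
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"free: {free_bytes / 1024**3:.2f} GiB / total: {total_bytes / 1024**3:.2f} GiB")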
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot,
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
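+
+    A usage sketch (hedged: assumes this class is re-exported as
+    ``torch.cuda.MemPool`` and that the pool is activated through the
+    :func:`~torch.cuda.use_mem_pool` context manager defined below)::
+
+        import torch
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            x = torch.empty(64, device="cuda")  # this allocation is routed to ``pool``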
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
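+
+
+# A debugging sketch for the snapshot helpers defined above (hedged: the file
+# name is arbitrary and `run_workload()` is a hypothetical stand-in for user
+# code; kept as comments so importing this module has no side effects):
+#
+#     torch.cuda.memory._record_memory_history(max_entries=100_000)
+#     run_workload()  # hypothetical function that allocates and frees GPU memory
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")
+#     # open snapshot.pickle in the viewer at https://pytorch.org/memory_viz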
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+    device = _get_amdsmi_device_index(device)
+
+    try:
+        handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+        procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+    except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+        return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes the amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
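
A minimal usage sketch for the MemPool/use_mem_pool API added above. Assumptions: a CUDA-enabled PyTorch build with at least one visible GPU, and that the module is used from its usual torch.cuda.memory location (the patch itself saves it as asdds.py).

import torch
from torch.cuda.memory import MemPool, use_mem_pool

pool = MemPool()  # pool that follows the current CUDACachingAllocator configuration
with use_mem_pool(pool):
    # allocations made inside the context are routed to `pool`
    x = torch.empty(1024, device="cuda")
    y = torch.empty(2048, device="cuda")

print(pool.use_count())      # reference count held on the pool
print(len(pool.snapshot()))  # allocator snapshot captured while this pool is active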
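A hedged sketch of the snapshot workflow described by the private `_record_memory_history`, `_snapshot`, and `_dump_snapshot` helpers above; being underscore-prefixed, their surface may shift between releases, and "snapshot.pickle" is just a placeholder filename.

import torch

torch.cuda.memory._record_memory_history(max_entries=100_000)  # start tracking alloc/free events

x = torch.randn(4096, 4096, device="cuda")  # stand-in for the workload being profiled
del x

torch.cuda.memory._dump_snapshot("snapshot.pickle")     # open at pytorch.org/memory_viz
torch.cuda.memory._record_memory_history(enabled=None)  # stop recording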
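A small, self-contained sketch of `memory_summary` and a couple of `memory_stats` keys in the out-of-memory situation the docstrings above mention (assumes a CUDA device; the oversized allocation is intentional and expected to fail).

import torch

try:
    huge = torch.empty(1 << 40, device="cuda")  # several TiB; should raise on any real GPU
except torch.cuda.OutOfMemoryError:
    print(torch.cuda.memory_summary(abbreviated=True))

stats = torch.cuda.memory_stats()
print(stats.get("reserved_bytes.all.peak", 0), stats.get("num_ooms", 0))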
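A usage sketch for `mem_get_info` and `list_gpu_processes` as defined above. The free/total figures come straight from cudaMemGetInfo for the current device; the process listing needs pynvml (or amdsmi on ROCm), as the function body notes.

import torch

free_bytes, total_bytes = torch.cuda.mem_get_info()  # current device by default
print(f"{free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB total")
print(torch.cuda.list_gpu_processes())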
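A sketch of the pluggable-allocator flow from `CUDAPluggableAllocator` and `change_current_allocator` above. The library path and symbol names ("my_alloc.so", "my_malloc", "my_free") are placeholders for a shared object you would build yourself against the signatures quoted in the docstring, and the swap must happen before the default allocator is first used, so this only runs once such a library exists.

import torch

custom = torch.cuda.memory.CUDAPluggableAllocator("my_alloc.so", "my_malloc", "my_free")
torch.cuda.memory.change_current_allocator(custom)

x = torch.zeros(16, device="cuda")  # this allocation is now served by the custom allocator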
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
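+
+    Example (an illustrative sketch added for documentation; assumes a
+    CUDA-capable device is available)::
+
+        >>> # prints a formatted table of allocator statistics for the current device
+        >>> print(torch.cuda.memory_summary(abbreviated=True))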
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
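+
+    Example (an illustrative sketch added for documentation; assumes a
+    CUDA-capable device is available)::
+
+        >>> # global free and total device memory, in bytes
+        >>> free, total = torch.cuda.mem_get_info()
+        >>> used = total - free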
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                           'active_awaiting_free', # waiting for another stream to finish using
+                                                   # this, then it will become free
+                           'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested', # the allocation received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                              # able to be used in future allocation calls
+            'segment_alloc', # the caching allocator asks cudaMalloc for more memory
+                             # and added it as a segment in its cache
+            'segment_free', # the caching allocator called cudaFree to return memory
+                            # to cuda, possibly trying to free up memory to
+                            # allocate more segments or because empty_cache was called
+            'oom',          # the allocator threw an OOM exception. 'size' is
+                            # the requested number of bytes that did not succeed
+            'snapshot'      # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
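+
+    Example (an illustrative sketch added for documentation; assumes a
+    CUDA-capable device and that the pool API is exposed under ``torch.cuda``)::
+
+        >>> # route allocations made inside the context manager to a dedicated pool
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     buf = torch.empty(1024, device="cuda")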
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
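+
+    Example (a minimal illustrative sketch added here, not taken from the
+    upstream docs; it assumes a CUDA device is available)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> x = torch.randn(1024, 1024, device="cuda")
+        >>> peak_bytes = torch.cuda.max_memory_reserved()  # peak reserved since the reset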
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
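+
+    Example (a minimal illustrative sketch added here, not taken from the
+    upstream docs; it assumes a CUDA device is available)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # this allocation is routed to ``pool``
+        >>> segments = pool.snapshot()  # inspect the segments held by this pool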
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
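+
+
+# ---------------------------------------------------------------------------
+# Illustrative usage sketch (not part of the upstream torch.cuda.memory API).
+# ``_example_memory_report`` is a hypothetical helper name added here only to
+# show how the statistics, summary, history, and snapshot utilities defined
+# above can be combined when debugging memory usage; it assumes a CUDA-capable
+# device is present.
+# ---------------------------------------------------------------------------
+def _example_memory_report(snapshot_file: str = "dump_snapshot.pickle") -> None:
+    if not torch.cuda.is_available():
+        return
+    _record_memory_history("all")  # record alloc/free events with stack traces
+    x = torch.empty(1024, 1024, device="cuda")  # make at least one allocation to record
+    stats = memory_stats()
+    print("allocated bytes (current):", stats.get("allocated_bytes.all.current", 0))
+    print("reserved bytes (peak):", stats.get("reserved_bytes.all.peak", 0))
+    print(memory_summary(abbreviated=True))
+    _dump_snapshot(snapshot_file)  # open the dump at https://pytorch.org/memory_viz
+    _record_memory_history(None)  # stop recording history
+    del x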
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocated block received a call to free its memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot',  # the allocator generated a memory snapshot,
+                         # useful to correlate a previously taken
+                         # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
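+
+    A minimal usage sketch (illustrative only; it assumes a CUDA-capable device and
+    uses the ``use_mem_pool`` context manager defined later in this module)::
+
+        pool = MemPool()
+        with use_mem_pool(pool):
+            buf = torch.empty(1024, device="cuda")  # this allocation is routed to ``pool``
+        del buf  # the freed block stays cached inside ``pool``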
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
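+
+    A minimal illustrative sketch (assuming a CUDA device is available)::
+
+        >>> # prints a table of current/peak allocated, reserved and requested memory
+        >>> print(torch.cuda.memory_summary(abbreviated=True))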
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
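+
+
+# Illustrative usage sketch (not part of the upstream module): route allocations
+# from a region of code into a private pool.  Assumes a CUDA device is available
+# and that this module is importable as ``torch.cuda.memory`` (as in upstream
+# PyTorch, where these names are also re-exported from ``torch.cuda``).
+#
+#     pool = torch.cuda.MemPool()
+#     with torch.cuda.memory.use_mem_pool(pool):
+#         x = torch.randn(1024, device="cuda")   # allocation routed to `pool`
+#     print(pool.use_count())                    # reference count of the pool
+#     print(pool.snapshot())                     # allocator state for this pool's segments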
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot',  # the allocator generated a memory snapshot
+                         # useful to correlate a previously taken
+                         # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX OSs
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
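+
+
+# A minimal illustrative sketch, assuming a CUDA-capable PyTorch build with at
+# least one visible GPU: it shows one plausible way to exercise the MemPool /
+# use_mem_pool APIs defined above, not a prescribed usage pattern.
+if __name__ == "__main__" and torch.cuda.is_available():
+    pool = MemPool()
+    # Allocations made inside the context are routed to the private pool.
+    with use_mem_pool(pool):
+        scratch = torch.empty(1024, device="cuda")
+    print("pool id:", pool.id, "reference count:", pool.use_count())
+    del scratch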
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # an allocation received a call to free its memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
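+
+
+# --- Illustrative usage sketches. The `_demo_*` helpers below are examples
+# only, not part of the documented API above; they assume a CUDA-capable
+# device and only exercise functions defined earlier in this module.
+
+
+def _demo_basic_memory_stats(device: Union[Device, int] = 0) -> None:
+    # Minimal sketch: read the headline allocator counters around a workload.
+    reset_peak_memory_stats(device)
+    x = torch.empty(1024, 1024, device=f"cuda:{_get_device_index(device)}")
+    print("allocated:", memory_allocated(device), "bytes")
+    print("reserved: ", memory_reserved(device), "bytes")
+    print("peak:     ", max_memory_allocated(device), "bytes")
+    # The flat dict from memory_stats() exposes the same counters by key.
+    stats = memory_stats(device)
+    print("peak (from stats):", stats.get("allocated_bytes.all.peak", 0))
+    # memory_summary() renders the same statistics as a table.
+    print(memory_summary(device, abbreviated=True))
+    # mem_get_info() reports device-wide free/total memory via cudaMemGetInfo.
+    free_bytes, total_bytes = mem_get_info(device)
+    print(f"free {free_bytes} / total {total_bytes} bytes")
+    del x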
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
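+
+
+def _demo_host_memory_stats() -> None:
+    # Minimal sketch of the pinned (page-locked) host memory counters
+    # described in host_memory_stats(). Counters may stay flat if the host
+    # allocator reuses a cached block instead of calling cudaHostAlloc again.
+    pinned = torch.empty(1 << 20, pin_memory=True)
+    stats = host_memory_stats()
+    print("cudaHostAlloc/cudaHostRegister calls:", stats.get("num_host_alloc", 0))
+    print("pinned bytes currently allocated:", stats.get("allocated_bytes.current", 0))
+    del pinned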
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
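+
+
+def _demo_memory_snapshot(path: str = "demo_snapshot.pickle") -> None:
+    # Minimal sketch of the snapshot workflow documented above: record
+    # alloc/free history, run a workload, then dump a pickle that can be
+    # loaded in the viewer at pytorch.org/memory_viz. The default `path`
+    # is illustrative, not a library default.
+    _record_memory_history(max_entries=100_000)
+    try:
+        buffers = [torch.empty(256, 256, device="cuda") for _ in range(8)]
+        del buffers
+        _dump_snapshot(path)
+    finally:
+        # Passing ``None`` stops further recording, as described in the
+        # _record_memory_history() docstring.
+        _record_memory_history(enabled=None)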
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
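+
+
+def _demo_mem_pool() -> None:
+    # Minimal sketch of routing allocations into a private pool with the
+    # MemPool / use_mem_pool helpers defined above. Pool behaviour follows
+    # the current CUDACachingAllocator configuration unless a custom
+    # allocator is passed to MemPool().
+    pool = MemPool()
+    with use_mem_pool(pool):
+        # Tensors created inside the context draw from `pool`.
+        scratch = torch.empty(512, 512, device="cuda")
+    print("pool id:", pool.id, "use_count:", pool.use_count())
+    # pool.snapshot() returns the allocator snapshot taken while this pool
+    # is the active MemPoolContext.
+    print("segments in snapshot:", len(pool.snapshot()))
+    del scratch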
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
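To make the APIs in the added module concrete, the sketch below ties together the history recording, MemPool routing, and snapshot helpers defined above. It is only an illustrative sketch: it assumes the module is importable as torch.cuda.memory with MemPool and use_mem_pool re-exported under torch.cuda (the paths the docstrings themselves reference), that a CUDA device is available, and the file name "mempool_example.pickle" is made up for the example.

# Usage sketch (assumptions: CUDA-enabled build, at least one visible GPU,
# module exposed as torch.cuda.memory; the output file name is illustrative).
import torch

if torch.cuda.is_available():
    # Record stack traces and a history of alloc/free events (see _record_memory_history).
    torch.cuda.memory._record_memory_history(enabled="all", max_entries=100_000)

    # Route the allocations made inside the context into a dedicated pool.
    pool = torch.cuda.MemPool()
    with torch.cuda.use_mem_pool(pool):
        x = torch.empty(1024, 1024, device="cuda")
        y = torch.empty(512, 512, device="cuda")

    print("bytes currently allocated:", torch.cuda.memory_allocated())
    print("pool reference count:", pool.use_count())

    # Dump the snapshot for the interactive viewer, then stop recording.
    torch.cuda.memory._dump_snapshot("mempool_example.pickle")
    torch.cuda.memory._record_memory_history(enabled=None)

The resulting pickle contains the segments and device_traces structures documented in _snapshot above and can be loaded in the viewer at pytorch.org/memory_viz.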
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from .
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
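+
+    For example, assuming a CUDA device is available, the per-iteration
+    measurement described above might look like::
+
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> x = torch.randn(4096, 4096, device="cuda")  # work that forces new allocations
+        >>> peak_bytes = torch.cuda.max_memory_reserved()  # peak since the reset above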
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
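+
+    For example, assuming a CUDA device is available and this context manager is
+    re-exported as ``torch.cuda.use_mem_pool``, routing allocations through a
+    dedicated pool might look like::
+
+        >>> pool = torch.cuda.MemPool()  # pool backed by the default allocator
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.randn(1024, device="cuda")  # allocated from `pool`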
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
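+
+    Example (a minimal sketch; assumes a CUDA device is available and relies on
+    the default allocator)::
+
+        pool = torch.cuda.MemPool()          # pool backed by the current allocator config
+        with torch.cuda.use_mem_pool(pool):  # allocations below are routed to ``pool``
+            buf = torch.empty(1024, device="cuda")
+        print(pool.id, pool.use_count())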
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
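+
+    Example (an illustrative sketch; the returned raw pointer is only meant for
+    interoperability and must be released explicitly)::
+
+        ptr = torch.cuda.caching_allocator_alloc(1024)  # 1 KiB on the current device/stream
+        # ... hand ``ptr`` to another framework ...
+        torch.cuda.caching_allocator_delete(ptr)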
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
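+
+    Example (a minimal sketch of the pattern described above; ``train_step`` is
+    a placeholder for your own code)::
+
+        torch.cuda.reset_peak_memory_stats()
+        train_step()                                   # hypothetical workload
+        peak_bytes = torch.cuda.max_memory_reserved()  # peak reserved since the reset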
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
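+
+    Example (an illustrative sketch; assumes at least one CUDA device)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool, device=0):
+            # tensors created on device 0 inside this block come from ``pool``
+            scratch = torch.ones(256, device="cuda:0")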
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
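+
+    Example (a common pattern, shown as a sketch; dropping the Python reference
+    first is what makes the cached block releasable)::
+
+        x = torch.empty(1024, 1024, device="cuda")
+        del x                     # the block becomes inactive in the cache
+        torch.cuda.empty_cache()  # return unoccupied cached memory to the driver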
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
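+
+    A minimal usage sketch (illustrative only; assumes CUDA is available)::
+
+        >>> print(torch.cuda.memory_summary(abbreviated=True))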
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
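+
+    A minimal usage sketch (illustrative only; assumes CUDA is available)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB")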
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
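+
+    A minimal usage sketch (illustrative only; it assumes CUDA is available and
+    uses the :func:`~torch.cuda.use_mem_pool` context manager defined later in
+    this module)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.randn(1024, device="cuda")  # allocation is routed to ``pool``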
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
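+
+    A minimal usage sketch (illustrative only; assumes CUDA is available)::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)  # raw 1 KiB allocation
+        >>> torch.cuda.caching_allocator_delete(ptr)        # must be freed explicitly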
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
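+
+    A minimal usage sketch (illustrative only; assumes CUDA is available)::
+
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> peak_bytes = torch.cuda.max_memory_reserved()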
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
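+
+
+# The statistics helpers above (memory_stats, memory_allocated, memory_summary) are
+# meant to be polled during training or after catching an OOM. The sketch below is an
+# illustrative addition, not part of the upstream API; the name `_example_log_memory`
+# and the choice to print to stdout are assumptions.
+def _example_log_memory(device: Union[Device, int] = None) -> None:
+    # Pull the flat stats dict once and read the same keys memory_summary() formats.
+    stats = memory_stats(device=device)
+    current = stats.get("allocated_bytes.all.current", 0)
+    peak = stats.get("allocated_bytes.all.peak", 0)
+    print(f"allocated: {current} bytes (peak {peak} bytes)")
+    # Abbreviated human-readable table, e.g. for logging when an OOM is caught.
+    print(memory_summary(device=device, abbreviated=True))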
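+
+
+# mem_get_info() and list_gpu_processes() complement the allocator statistics with
+# device-wide numbers reported by the driver. A minimal illustrative helper; the name
+# `_example_report_free_memory` and the GiB formatting are assumptions.
+def _example_report_free_memory(device: Union[Device, int] = None) -> str:
+    free_bytes, total_bytes = mem_get_info(device)
+    gib = 1024**3
+    header = f"{free_bytes / gib:.2f} GiB free of {total_bytes / gib:.2f} GiB"
+    # Append the per-process view so the report also shows who holds the rest.
+    return header + "\n" + list_gpu_processes(device)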
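+
+
+# The private _record_memory_history()/_snapshot()/_dump_snapshot() helpers are meant
+# to be used together: start recording, run the workload, dump the pickle, then open
+# it in the viewer at pytorch.org/memory_viz. A minimal sketch of that workflow; the
+# callable `workload` and the file name "snapshot.pickle" are placeholders.
+def _example_capture_snapshot(workload, filename="snapshot.pickle") -> None:
+    # Record alloc/free events with Python stack traces attached.
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    try:
+        workload()
+    finally:
+        # Persist the snapshot for the viewer, then stop recording.
+        _dump_snapshot(filename)
+        _record_memory_history(enabled=None)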
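+
+
+# CUDAPluggableAllocator and change_current_allocator() must run before any CUDA
+# memory has been allocated. A sketch under assumed inputs: the shared library path
+# and the symbol names `my_alloc`/`my_free` are placeholders for a user-compiled
+# allocator matching the signatures documented above.
+def _example_use_pluggable_allocator(path_to_so_file="/path/to/alloc.so") -> None:
+    custom_allocator = CUDAPluggableAllocator(path_to_so_file, "my_alloc", "my_free")
+    # Errors if the default allocator has already been initialized/used.
+    change_current_allocator(custom_allocator)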
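+
+
+# MemPool and use_mem_pool() route the allocations made inside a region of code into
+# a dedicated pool. A minimal sketch; the tensor shape is arbitrary and a CUDA device
+# is assumed to be available.
+def _example_allocate_in_pool() -> torch.Tensor:
+    pool = MemPool()  # backed by the current CUDACachingAllocator configuration
+    with use_mem_pool(pool):
+        # Allocations made inside the context are served from `pool`.
+        out = torch.empty(1024, device="cuda")
+    # Subsequent allocations go back to the regular allocator.
+    return out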
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
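+
+    A minimal usage sketch (illustrative only; assumes at least one visible
+    CUDA device, and the numbers in the printed table depend entirely on the
+    current allocation state)::
+
+        import torch
+
+        if torch.cuda.is_available():
+            x = torch.empty(1024, 1024, device="cuda")  # reserve some memory first
+            print(torch.cuda.memory_summary(device=0, abbreviated=True))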
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
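+
+    A small usage sketch (illustrative only; assumes a visible CUDA device,
+    and note that the values are reported by the driver for the whole device,
+    not just this process)::
+
+        import torch
+
+        if torch.cuda.is_available():
+            free_bytes, total_bytes = torch.cuda.mem_get_info(0)
+            print(f"{free_bytes / total_bytes:.1%} of device 0 memory is free")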
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly to free up memory for more segments
+                                 # or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot,
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will raise an error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the currently used allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
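+
+    A minimal usage sketch (illustrative only; assumes a visible CUDA device
+    and that, as in the docstrings above, the pool utilities are exposed as
+    ``torch.cuda.MemPool`` and ``torch.cuda.use_mem_pool``)::
+
+        import torch
+
+        pool = torch.cuda.MemPool()  # pool backed by the current allocator config
+        with torch.cuda.use_mem_pool(pool):
+            # allocations made inside the context are routed to `pool`
+            x = torch.empty(1024, device="cuda")
+        print(pool.use_count())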
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
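+
+
+# Illustrative, optional demo (not part of the module's public API): a sketch of
+# the snapshot workflow documented above. It assumes a CUDA-capable device is
+# present; the output file name is arbitrary. It records allocation history,
+# performs a few allocations, and writes a snapshot that can be loaded in the
+# interactive viewer at pytorch.org/memory_viz.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        _record_memory_history(enabled="all")
+        tensors = [torch.empty(1024, 1024, device="cuda") for _ in range(4)]
+        del tensors
+        _dump_snapshot("example_snapshot.pickle")
+        _record_memory_history(enabled=None)  # stop recording history
+        print(memory_summary(abbreviated=True))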
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
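+
+    A minimal sketch of the per-iteration measurement described above
+    (illustrative only; ``loader`` and ``train_step`` are hypothetical
+    placeholders for your own data loader and training step)::
+
+        for batch in loader:
+            torch.cuda.reset_peak_memory_stats()
+            train_step(batch)
+            peak = torch.cuda.max_memory_reserved()
+            print(f"peak reserved this iteration: {peak} bytes")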
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
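+        # On ROCm builds (``torch.version.hip``), process information comes from
+        # amdsmi rather than pynvml; the handle and per-process memory lookup
+        # below mirrors the NVML path above.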
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
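+
+    A minimal usage sketch (illustrative only; assumes a CUDA device is available
+    and that this module's public names are exposed under ``torch.cuda``)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # allocations made inside the block are routed to ``pool``
+            x = torch.empty(1024, device="cuda")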
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
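+
+    A short illustrative sketch (assumes a CUDA build of PyTorch recent enough
+    to expose ``MemPool``; the variable names are only examples)::
+
+        pool = torch.cuda.MemPool()   # backed by the current allocator configuration
+        print(pool.id)                # tuple of two ints identifying the pool
+        print(pool.use_count())       # reference count held on the pool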
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
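+
+    A minimal illustrative sketch (the raw pointer must be freed explicitly;
+    assumes an initialized CUDA context)::
+
+        ptr = torch.cuda.caching_allocator_alloc(1024)   # 1 KiB on the current device
+        try:
+            ...  # hand the raw pointer to another framework
+        finally:
+            torch.cuda.caching_allocator_delete(ptr)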
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA caching allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction limits how much of a CUDA device's memory the caching allocator may use.
+    The allowed amount equals the total visible memory multiplied by the fraction.
+    If a process tries to allocate more than the allowed amount, the allocator raises
+    an out-of-memory error.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and is visible
+    in `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
+    """
+    if is_initialized():
+        torch._C._cuda_emptyCache()
+
+
+def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]:
+    r"""Return a dictionary of CUDA memory allocator statistics for a given device.
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
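+
+    An illustrative per-iteration measurement sketch (``train_step`` is a
+    placeholder for user code)::
+
+        torch.cuda.reset_peak_memory_stats()
+        train_step()
+        peak = torch.cuda.max_memory_reserved()
+        print(f"peak reserved this step: {peak / 2**20:.1f} MiB")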
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
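+
+        # NOTE: ``torch.version.hip`` is set on ROCm builds, so the branch below
+        # queries per-process GPU memory through amdsmi rather than pynvml.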
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
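+
+    A minimal illustrative sketch (assumes CUDA is available)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            buf = torch.empty(1024, device="cuda")   # routed to ``pool``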
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
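+
+    An illustrative sketch (sizes are arbitrary; assumes a CUDA device)::
+
+        x = torch.empty(256 * 1024 * 1024, device="cuda")   # ~1 GiB of float32
+        del x                                 # the blocks stay cached by the allocator
+        torch.cuda.empty_cache()              # hand unused cached blocks back to the driver
+        print(torch.cuda.memory_reserved())   # reserved bytes should drop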
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
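+
+ Example (an illustrative sketch; the exact table contents depend on the
+ device and on prior allocations)::
+
+ >>> print(torch.cuda.memory_summary(abbreviated=True))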
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
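+
+ Example (an illustrative sketch; assumes a CUDA device is available)::
+
+ >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+ >>> print(f"{free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB")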
+
+ """
+ if device is None:
+ device = torch.cuda.current_device()
+ # optional=True allows `device = torch.device('cuda')` for which device.index is None
+ device = _get_device_index(device, optional=True)
+ return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+ enabled: bool,
+ record_context=True,
+ trace_alloc_max_entries=1,
+ trace_alloc_record_context=False,
+ device: Union[Device, int] = None,
+ record_context_cpp=False,
+ clear_history=False,
+):
+ _C._cuda_record_memory_history_legacy(
+ enabled,
+ record_context,
+ trace_alloc_max_entries,
+ trace_alloc_record_context,
+ record_context_cpp,
+ clear_history,
+ )
+
+
+def _record_memory_history(
+ enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+ """Enable recording of stack traces associated with memory
+ allocations, so you can tell what allocated any piece of memory in
+ :func:`torch.cuda.memory._snapshot()`.
+
+ In addition to keeping stack traces with each current allocation and free,
+ this will also enable recording of a history of all alloc/free events.
+
+ Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+ and the tools in `_memory_viz.py` to visualize snapshots.
+
+ The Python trace collection is fast (2us per trace), so you may consider
+ enabling this on production jobs if you anticipate ever having to debug
+ memory issues.
+
+ C++ trace collection is also fast (~50ns/frame), which for many typical programs
+ works out to ~2us per trace, but can vary depending on stack depth.
+
+ Args:
+ enabled (Literal[None, "state", "all"], optional):
+ `None`, disable recording memory history.
+ `"state"`, keep information for currently allocated memory.
+ `"all"`, additionally keep a history of all alloc/free calls.
+ Defaults to "all".
+ context (Literal[None, "state", "alloc", "all"], optional):
+ `None`, Do not record any tracebacks.
+ `"state"`, Record tracebacks for currently allocated memory.
+ `"alloc"`, additionally keep tracebacks for alloc calls.
+ `"all"`, additionally keep tracebacks for free calls.
+ Defaults to "all".
+ stacks (Literal["python", "all"], optional):
+ `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+ `"all"`, additionally include C++ frames
+ Defaults to "all".
+ max_entries (int, optional): Keep a maximum of `max_entries`
+ alloc/free events in the recorded history.
+ """
+ if isinstance(enabled, bool):
+ return _record_memory_history_legacy(enabled, *args, **kwargs)
+ else:
+ return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+ enabled: Optional[str] = "all",
+ context: Optional[str] = "all",
+ stacks: str = "all",
+ max_entries: int = sys.maxsize,
+ device: Union[Device, int] = None,
+ clear_history: bool = False,
+):
+ _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+ """Save a snapshot of CUDA memory state at the time it was called.
+
+ The state is represented as a dictionary with the following structure.
+
+ .. code-block:: python
+
+ class Snapshot(TypedDict):
+ segments : List[Segment]
+ device_traces: List[List[TraceEntry]]
+
+ class Segment(TypedDict):
+ # Segments are memory returned from a cudaMalloc call.
+ # The size of reserved memory is the sum of all Segments.
+ # Segments are cached and reused for future allocations.
+ # If the reuse is smaller than the segment, the segment
+ # is split into more than one Block.
+ # empty_cache() frees Segments that are entirely inactive.
+ address: int
+ total_size: int # cudaMalloc'd size of segment
+ stream: int
+ segment_type: Literal['small', 'large'] # 'large' (>1MB)
+ allocated_size: int # size of memory in use
+ active_size: int # size of memory in use or in active_awaiting_free state
+ blocks : List[Block]
+
+ class Block(TypedDict):
+ # A piece of memory returned from the allocator, or
+ # currently cached but inactive.
+ size: int
+ requested_size: int # size requested during malloc, may be smaller than
+ # size due to rounding
+ address: int
+ state: Literal['active_allocated', # used by a tensor
+ 'active_awaiting_free', # waiting for another stream to finish using
+ # this, then it will become free
+ 'inactive',] # free for reuse
+ frames: List[Frame] # stack trace from where the allocation occurred
+
+ class Frame(TypedDict):
+ filename: str
+ line: int
+ name: str
+
+ class TraceEntry(TypedDict):
+ # When `torch.cuda.memory._record_memory_history()` is enabled,
+ # the snapshot will contain TraceEntry objects that record each
+ # action the allocator took.
+ action: Literal[
+ 'alloc', # memory allocated
+ 'free_requested', # the allocated block received a call to free memory
+ 'free_completed', # the memory that was requested to be freed is now
+ # able to be used in future allocation calls
+ 'segment_alloc', # the caching allocator asks cudaMalloc for more memory
+ # and adds it as a segment in its cache
+ 'segment_free', # the caching allocator called cudaFree to return memory
+ # to cuda, possibly trying to free up memory to
+ # allocate more segments or because empty_cache was called
+ 'oom', # the allocator threw an OOM exception. 'size' is
+ # the requested number of bytes that did not succeed
+ 'snapshot' # the allocator generated a memory snapshot
+ # useful to correlate a previously taken
+ # snapshot with this trace
+ ]
+ addr: int # not present for OOM
+ frames: List[Frame]
+ size: int
+ stream: int
+ device_free: int # only present for OOM, the amount of
+ # memory cuda still reports to be free
+
+ Returns:
+ The Snapshot dictionary object
+ """
+ return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+ """
+ Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+ This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+ Args:
+ filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+ """
+ s = _snapshot()
+ with open(filename, "wb") as f:
+ pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+ if snapshot is None:
+ snapshot = _snapshot()
+ with open(filename, "w") as f:
+ f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+ if snapshot is None:
+ snapshot = _snapshot()
+ with open(filename, "w") as f:
+ f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+ return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+ r"""Return a string describing the active allocator backend as set by
+ ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+ ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+ (CUDA's built-in asynchronous allocator).
+
+ .. note::
+ See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+ """
+ return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+ r"""Wrapper over internal CUDA memory allocators."""
+
+ def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+ self._allocator = allocator
+
+ def allocator(self):
+ return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+ r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+ def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+ r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+ To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+ Args:
+ path_to_so_file(str): Path in the filesystem to the `.so` file containing
+ the allocator functions
+ alloc_fn_name(str): Name of the function to perform the memory allocation
+ in the .so file. The signature must be:
+ void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+ free_fn_name(str): Name of the function to perform the memory release
+ in the .so file. The signature must be:
+ void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+ .. warning::
+ This is currently supported only on UNIX OSs.
+
+ .. note::
+ See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+ """
+ allocator = ctypes.CDLL(path_to_so_file)
+ alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+ free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+ assert alloc_fn is not None
+ assert free_fn is not None
+ self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+ r"""Change the currently used memory allocator to be the one provided.
+
+ If the current allocator has already been used/initialized, this function will error.
+
+
+ Args:
+ allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+ .. note::
+ See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+ """
+ torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+ r"""Return the allocator being currently used.
+
+ .. note::
+ See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+ """
+ return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+ r"""MemPoolContext holds the currently active pool and stashes the previous
+ pool. On deletion it makes the previous pool active.
+
+ Args:
+ pool(torch.cuda.MemPool): a MemPool object to be made active so that
+ allocations route to this pool.
+
+ """
+
+ def __init__(self, pool: _MemPool):
+ super().__init__(pool)
+
+ @staticmethod
+ def active_pool() -> Optional[_MemPool]:
+ r"""Returns the active MemPool."""
+ return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+ r"""MemPool represents a pool of memory in a caching allocator. Currently,
+ it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+ Args:
+ allocator(torch._C._cuda_CUDAAllocator, optional): a
+ torch._C._cuda_CUDAAllocator object that can be used to
+ define how memory gets allocated in the pool. If :attr:`allocator`
+ is ``None`` (default), memory allocation follows the default/
+ current configuration of the CUDACachingAllocator.
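+
+ Example (an illustrative sketch; assumes ``MemPool`` and ``use_mem_pool`` are
+ re-exported as ``torch.cuda.MemPool`` and ``torch.cuda.use_mem_pool`` per the
+ ``__all__`` list above, and that a CUDA device is available)::
+
+ >>> pool = torch.cuda.MemPool()
+ >>> with torch.cuda.use_mem_pool(pool):
+ ... x = torch.ones(1024, device="cuda") # this allocation is routed to the pool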
+
+ """
+
+ def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+ super().__init__(allocator, True)
+
+ @property
+ def id(self) -> tuple[int, int]:
+ r"""Returns the ID of this pool as a tuple of two ints."""
+ return super().id
+
+ @property
+ def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+ r"""Returns the allocator this MemPool routes allocations to."""
+ return super().allocator
+
+ def use_count(self) -> int:
+ r"""Returns the reference count of this pool."""
+ return super().use_count()
+
+ def snapshot(self):
+ r"""Return a snapshot of the CUDA memory allocator pool state across all
+ devices.
+
+ Interpreting the output of this function requires familiarity with the
+ memory allocator internals.
+
+ .. note::
+ See :ref:`cuda-memory-management` for more details about GPU memory
+ management.
+ """
+ try:
+ ctx = MemPoolContext(self)
+ snapshot = torch.cuda.memory_snapshot()
+ finally:
+ del ctx
+ return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+ r"""A context manager that routes allocations to a given pool.
+
+ Args:
+ pool(torch.cuda.MemPool): a MemPool object to be made active so that
+ allocations route to this pool.
+ device (torch.device or int, optional): selected device. Uses MemPool on
+ the current device, given by :func:`~torch.cuda.current_device`,
+ if :attr:`device` is ``None`` (default).
+
+ """
+ ctx = MemPoolContext(pool)
+ device_index = (
+ torch.cuda.current_device() if device is None else _get_device_index(device)
+ )
+ _cuda_beginAllocateToPool(device_index, pool.id)
+ try:
+ yield
+ finally:
+ _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+ _cuda_releasePool(device_index, pool.id)
+ del ctx
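+
+
+# Illustrative usage sketch (comments only, not executed on import): one common way
+# to combine the private history/snapshot helpers defined above. The file name
+# "snapshot.pickle" is arbitrary.
+#
+# torch.cuda.memory._record_memory_history(max_entries=100_000)
+# ... run the workload being profiled ...
+# torch.cuda.memory._dump_snapshot("snapshot.pickle") # view at pytorch.org/memory_viz
+# torch.cuda.memory._record_memory_history(enabled=None) # stop recording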
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc'  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
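+
+
+# Illustrative usage sketch (not part of the upstream torch.cuda.memory module):
+# a minimal memory-debugging workflow built from the functions defined above. It
+# assumes a CUDA-enabled build with at least one visible GPU; the tensor sizes and
+# the snapshot file name are arbitrary examples.
+if __name__ == "__main__" and torch.cuda.is_available():
+    # Record alloc/free events together with Python stack traces.
+    _record_memory_history(enabled="all", stacks="python")
+
+    # Perform a few allocations so there is something to inspect.
+    x = torch.randn(1024, 1024, device="cuda")
+    y = x @ x
+    del x, y
+
+    # Print human-readable allocator statistics and dump a snapshot that can be
+    # opened in the viewer at pytorch.org/memory_viz.
+    print(memory_summary(abbreviated=True))
+    _dump_snapshot("example_snapshot.pickle")
+
+    # Stop recording when done.
+    _record_memory_history(enabled=None)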
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    return memory_stats(device=device).get("reserved_bytes.all.peak", 0)
+
+
+@deprecated(
+    "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`",
+    category=FutureWarning,
+)
+def memory_cached(device: Union[Device, int] = None) -> int:
+    r"""Deprecated; see :func:`~torch.cuda.memory_reserved`."""
+    return memory_reserved(device=device)
+
+
+@deprecated(
+    "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`",
+    category=FutureWarning,
+)
+def max_memory_cached(device: Union[Device, int] = None) -> int:
+    r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`."""
+    return max_memory_reserved(device=device)
+
+
+def memory_snapshot():
+    r"""Return a snapshot of the CUDA memory allocator state across all devices.
+
+    Interpreting the output of this function requires familiarity with the
+    memory allocator internals.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    return torch._C._cuda_memorySnapshot()["segments"]
+
+
+def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str:
+    r"""Return a human-readable printout of the current memory allocator statistics for a given device.
+
+    This can be useful to display periodically during training, or when
+    handling out-of-memory exceptions.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            printout for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+        abbreviated (bool, optional): whether to return an abbreviated summary
+            (default: False).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
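+
+    Example (illustrative sketch; assumes an initialized CUDA device)::
+
+        >>> # Illustrative only -- the exact table depends on allocator state.
+        >>> print(torch.cuda.memory_summary(abbreviated=True))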
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
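+
+    Example (illustrative sketch; assumes an initialized CUDA device)::
+
+        >>> # Illustrative only -- values depend on the device and other processes.
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes / total_bytes:.1%} of device memory is free")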
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly to free up memory to allocate
+                                 # more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot,
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the ``.so`` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function that performs the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function that performs the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
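+
+    Example (illustrative sketch; assumes a CUDA-capable build and uses
+    :func:`~torch.cuda.use_mem_pool`, defined later in this module)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     # Illustrative only -- allocations made here are routed to `pool`.
+        ...     x = torch.empty(1024, device="cuda")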
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
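+
+
+# Illustrative sketch, left as comments rather than executable module code: the
+# snapshot workflow described in the docstrings of `_record_memory_history`,
+# `_snapshot`, and `_dump_snapshot` above. It assumes a CUDA-capable build, and
+# the output filename is a placeholder.
+#
+#   torch.cuda.memory._record_memory_history(max_entries=100000)
+#   ...  # run the workload whose allocations should be captured
+#   torch.cuda.memory._dump_snapshot("snapshot.pickle")
+#   torch.cuda.memory._record_memory_history(enabled=None)  # stop recording
+#   # open "snapshot.pickle" in the viewer at https://pytorch.org/memory_viz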
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator currently in use.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
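+# Editor's note: a minimal sketch, not part of the original patch, showing how a
+# MemPool is typically combined with use_mem_pool so that allocations made inside
+# the `with` block are served from that pool. The workload tensor is a placeholder.
+def _example_allocate_in_mem_pool():
+    # Pool backed by the default CUDACachingAllocator configuration.
+    pool = MemPool()
+    with use_mem_pool(pool):
+        # This tensor's memory is routed to `pool`.
+        x = torch.empty(1024, device="cuda")
+    # Outside the block, allocations go back to the previously active pool.
+    return x, pool.use_count()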
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
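+
+
+# ---------------------------------------------------------------------------
+# Illustrative usage (not part of the module above): a minimal sketch, assuming
+# a CUDA-enabled PyTorch build and at least one visible GPU.  It shows how the
+# statistics, summary, and MemPool helpers defined in this file might be
+# exercised together.  The function name `_demo_memory_tools` is hypothetical.
+# ---------------------------------------------------------------------------
+def _demo_memory_tools() -> None:
+    if not torch.cuda.is_available():
+        return
+
+    # Allocate a tensor, then read a few entries from the flattened stats dict.
+    x = torch.empty(1024, 1024, device="cuda")
+    stats = memory_stats()
+    print("current allocated bytes:", stats.get("allocated_bytes.all.current", 0))
+    print("peak allocated bytes:   ", stats.get("allocated_bytes.all.peak", 0))
+
+    # Route allocations made inside the context manager to a dedicated pool.
+    pool = MemPool()
+    with use_mem_pool(pool):
+        y = torch.empty(256, 256, device="cuda")
+    print("pool snapshot segments:", len(pool.snapshot()))
+
+    # Human-readable one-page summary of the caching allocator state.
+    print(memory_summary(abbreviated=True))
+
+    del x, y
+    empty_cache()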
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot,
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator, use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator currently being used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
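
The memory-introspection helpers collected in this module are easiest to follow with a few short usage sketches; the snippets below are editorial illustrations with arbitrary shapes, paths, and file names, not part of the patch itself. First, a minimal sketch of querying device memory with mem_get_info() and list_gpu_processes(), assuming at least one CUDA device is visible:

import torch

if torch.cuda.is_available():
    # Free and total device memory in bytes, as reported by cudaMemGetInfo.
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    print(f"free {free_bytes / 2**30:.2f} GiB of {total_bytes / 2**30:.2f} GiB")

    # Human-readable list of processes currently using the device.
    print(torch.cuda.list_gpu_processes())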
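
A sketch of the snapshot workflow around _record_memory_history() and _dump_snapshot(); the workload and the output file name are placeholders:

import torch

# Record stack traces plus a history of alloc/free events.
torch.cuda.memory._record_memory_history(max_entries=100_000)

x = torch.randn(1024, 1024, device="cuda")
y = x @ x
del x, y

# Pickle the snapshot for the interactive viewer at pytorch.org/memory_viz.
torch.cuda.memory._dump_snapshot("snapshot.pickle")

# Stop recording.
torch.cuda.memory._record_memory_history(enabled=None)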
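
The Snapshot/Segment/Block layout documented for _snapshot() can also be walked directly; a minimal sketch that tallies reserved bytes and active blocks, assuming CUDA has already been initialized by at least one allocation:

import torch

snap = torch.cuda.memory._snapshot()

# Reserved memory is the sum of all Segment sizes; active blocks are in use by tensors.
reserved_bytes = sum(seg["total_size"] for seg in snap["segments"])
active_blocks = sum(
    1
    for seg in snap["segments"]
    for blk in seg["blocks"]
    if blk["state"] == "active_allocated"
)
print(
    f"{reserved_bytes} bytes reserved in {len(snap['segments'])} segments, "
    f"{active_blocks} active blocks"
)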
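
A sketch of checking the backend and swapping in a pluggable allocator with CUDAPluggableAllocator and change_current_allocator(); the shared-object path and the my_alloc/my_free symbol names are hypothetical and must match the C signatures given in the docstring above:

import torch

# "native" or "cudaMallocAsync", as selected by PYTORCH_CUDA_ALLOC_CONF.
print(torch.cuda.get_allocator_backend())

# Hypothetical library exposing my_alloc/my_free with the required signatures.
new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
    "./my_allocator.so", "my_alloc", "my_free"
)
# Must run before the default allocator has been used or initialized.
torch.cuda.memory.change_current_allocator(new_alloc)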
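
Finally, a sketch of routing allocations through a MemPool with the use_mem_pool() context manager; this API is internal and experimental, so treat the example as illustrative only:

import torch
from torch.cuda.memory import MemPool, MemPoolContext, use_mem_pool

pool = MemPool()  # backed by the current CUDACachingAllocator configuration

with use_mem_pool(pool):
    # Allocations made inside the context are routed to `pool`.
    t = torch.empty(1024, device="cuda")
    print(MemPoolContext.active_pool() is not None)  # True while the pool is active

print(pool.id, pool.use_count())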
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
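+
+    A minimal usage sketch (illustrative only; assumes CUDA is available)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> import torch
+        >>> print(torch.cuda.memory_summary(abbreviated=True))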
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
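+
+    A minimal usage sketch (illustrative only; assumes CUDA is available)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> import torch
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> assert free_bytes <= total_bytes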
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
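+
+    A minimal usage sketch (illustrative only; assumes CUDA is available; the
+    variable names are placeholders)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> import torch
+        >>> pool = torch.cuda.MemPool()  # pool backed by the current allocator configuration
+        >>> pool_id = pool.id  # opaque ``(int, int)`` identifier of the pool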
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc'  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
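+
+
+# --- Reviewer sketch (not part of the copied torch.cuda.memory module): a minimal,
+# hedged example of how the debugging helpers above are typically combined. It
+# assumes a CUDA device is available; `run_step` is a hypothetical user-supplied
+# callable, everything else is defined in this file or in torch.cuda.
+def _example_debug_memory(run_step, device=0):
+    """Record allocation history for one step, print stats, and dump a snapshot."""
+    _record_memory_history("all", context="all", stacks="all")  # start tracing alloc/free
+    reset_peak_memory_stats(device)  # measure peaks for this step only
+    run_step()
+    torch.cuda.synchronize(device)  # make sure all kernels have finished
+    print(memory_summary(device))  # human-readable allocator statistics table
+    _dump_snapshot("step_snapshot.pickle")  # open at pytorch.org/memory_viz
+    _record_memory_history(None)  # stop recording
+    return max_memory_allocated(device)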
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
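+
+    A minimal usage sketch (assuming at least one CUDA device is available; the
+    tensor sizes below are arbitrary):
+
+    .. code-block:: python
+
+        import torch
+
+        x = torch.randn(1024, 1024, device="cuda")
+        y = x @ x
+        # Full report for the current device.
+        print(torch.cuda.memory_summary())
+        # Shorter report without the per-pool breakdown.
+        print(torch.cuda.memory_summary(abbreviated=True))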
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
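+
+    A minimal usage sketch (assuming a CUDA device is present; the values depend
+    on the GPU and on other processes currently using it):
+
+    .. code-block:: python
+
+        import torch
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"free: {free_bytes / 1024**3:.2f} GiB / total: {total_bytes / 1024**3:.2f} GiB")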
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
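+
+    A minimal usage sketch (assuming a CUDA device is available); allocations made
+    while the pool is active through :func:`~torch.cuda.use_mem_pool` are routed
+    to it:
+
+    .. code-block:: python
+
+        import torch
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            x = torch.randn(1024, device="cuda")  # served from ``pool``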
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
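+
+
+# The commented sketch below shows how the private snapshot helpers in this module
+# are commonly combined. It is an illustration only: it assumes a CUDA device and a
+# workload function named `train_step` (hypothetical), and it is kept as comments so
+# that importing this module stays side-effect free.
+#
+#     import torch
+#
+#     torch.cuda.memory._record_memory_history(max_entries=100_000)
+#     for _ in range(10):
+#         train_step()
+#     # The resulting pickle can be loaded in the viewer at pytorch.org/memory_viz.
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")
+#     # Stop recording when done.
+#     torch.cuda.memory._record_memory_history(enabled=None)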
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
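+
+
+# --- Editorial usage sketches (not part of the upstream module) ---------------------
+# A minimal, hedged sketch of the snapshot workflow documented above: record allocator
+# history, run an arbitrary workload, then dump a snapshot that can be opened at
+# pytorch.org/memory_viz. Assumes a CUDA-capable build with an available device; the
+# tensor shape and the output filename are illustrative only.
+def _example_snapshot_workflow():
+    _record_memory_history(max_entries=100_000)  # keep stack traces for alloc/free events
+    x = torch.randn(4096, 4096, device="cuda")  # placeholder workload that allocates GPU memory
+    y = x @ x
+    del x, y
+    _dump_snapshot("example_snapshot.pickle")  # open this file in the memory_viz viewer
+    _record_memory_history(enabled=None)  # stop recording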
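+
+
+# A hedged sketch of installing a custom allocator via CUDAPluggableAllocator.
+# "my_alloc.so", "my_malloc" and "my_free" are hypothetical names: the shared library
+# must export functions with the C signatures documented in CUDAPluggableAllocator,
+# and the swap must happen before the process makes any CUDA allocation.
+def _example_pluggable_allocator():
+    custom_allocator = CUDAPluggableAllocator("my_alloc.so", "my_malloc", "my_free")
+    change_current_allocator(custom_allocator)
+    return torch.zeros(16, device="cuda")  # now serviced by the custom allocator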
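+
+
+# A minimal sketch of routing allocations into a dedicated MemPool with the
+# use_mem_pool context manager defined above. Assumes an available CUDA device;
+# the tensor size is arbitrary.
+def _example_mem_pool():
+    pool = MemPool()
+    with use_mem_pool(pool):
+        buf = torch.empty(1024, device="cuda")  # allocated out of `pool`
+    print(pool.use_count())  # reference count held on the pool
+    del buf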
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
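+
+    For illustration, a minimal sketch of measuring the peak for one iteration
+    (``train_step()`` below stands in for one iteration of a hypothetical
+    training loop and is not part of this module)::
+
+        torch.cuda.reset_peak_memory_stats()
+        train_step()
+        peak_bytes = torch.cuda.max_memory_reserved()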
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
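+
+        # ROCm builds go through amdsmi rather than pynvml; the index below is
+        # translated into amdsmi's device enumeration order.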
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory CUDA still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSs.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
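+
+    A minimal usage sketch (the tensor shape is illustrative only)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # allocations made inside the block are routed to ``pool``
+            x = torch.randn(1024, device="cuda")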
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocated block received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
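+
+    A minimal usage sketch (illustrative only; assumes a CUDA device is
+    available and the allocation inside the block fits in device memory)::
+
+        >>> import torch
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")
+        >>> # allocations made inside the block were routed to ``pool``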
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
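+
+    A minimal usage sketch (illustrative only; assumes a CUDA device is
+    available; the 1 MiB size is arbitrary)::
+
+        >>> import torch
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024 * 1024)
+        >>> # ... hand ``ptr`` to another framework, then release it ...
+        >>> torch.cuda.caching_allocator_delete(ptr)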
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
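+
+    A minimal usage sketch (illustrative only; assumes CUDA is available and
+    some pinned host memory has been allocated)::
+
+        >>> import torch
+        >>> t = torch.empty(1024, pin_memory=True)
+        >>> stats = torch.cuda.host_memory_stats()
+        >>> stats.get("allocated_bytes.current", 0)  # pinned bytes currently allocated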
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
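+
+    A minimal usage sketch (illustrative only; assumes a CUDA device is
+    available)::
+
+        >>> import torch
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> peak_bytes = torch.cuda.max_memory_reserved()
+        >>> # peak_bytes is always >= torch.cuda.memory_reserved() at this point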
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
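+
+    A minimal usage sketch (illustrative only; assumes device 0 exists)::
+
+        >>> import torch
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool, device=0):
+        ...     buf = torch.empty(4096, device="cuda:0")
+        >>> # leaving the block restores the previously active pool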
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
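+
+    A minimal usage sketch (illustrative only; assumes a CUDA device is
+    available)::
+
+        >>> import torch
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> del x
+        >>> torch.cuda.empty_cache()  # cached blocks are returned to the driver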
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
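+
+    A minimal usage sketch (illustrative only; assumes CUDA is available):
+
+    Example::
+
+        >>> print(torch.cuda.memory_summary())                  # full table for the current device
+        >>> print(torch.cuda.memory_summary(abbreviated=True))  # omit the per-pool breakdown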
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
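+
+    A minimal usage sketch (illustrative only; assumes at least one visible
+    CUDA device):
+
+    Example::
+
+        >>> free, total = torch.cuda.mem_get_info()  # bytes reported by cudaMemGetInfo
+        >>> used = total - free                      # includes memory held by other processes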
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition to keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. A brief usage + sketch also appears later in this file. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currently allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments.
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more than one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # currently cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc', # memory allocated + 'free_requested', # the allocation received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator asks cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda, possibly trying to free up memory to + # allocate more segments or because empty_cache was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot', # the allocator generated a memory snapshot + # useful to correlate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
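+
+    A minimal usage sketch (illustrative only; assumes CUDA is available and
+    uses :func:`~torch.cuda.use_mem_pool`, defined later in this module, to
+    route allocations into the pool):
+
+    Example::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # allocation served from ``pool``
+        >>> pool.use_count()  # reference count held on this pool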
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx
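+
+
+# Illustrative usage sketch (comments only, nothing here executes): the memory-history
+# and peak-stat helpers above are typically combined as follows; `train_step` is a
+# placeholder for user code.
+#
+#     torch.cuda.memory._record_memory_history()          # start collecting alloc/free traces
+#     torch.cuda.reset_peak_memory_stats()
+#     train_step()                                         # hypothetical workload
+#     peak = torch.cuda.max_memory_allocated()             # peak bytes since the reset
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")  # inspect at pytorch.org/memory_viz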
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+
+def list_gpu_processes(device: Union[Device, int] = None) -> str:
+ r"""Return a human-readable printout of the running processes and their GPU memory use for a given device.
+
+ This can be useful to display periodically during training, or when
+ handling out-of-memory exceptions.
+
+ Args:
+ device (torch.device or int, optional): selected device. Returns
+ printout for the current device, given by :func:`~torch.cuda.current_device`,
+ if :attr:`device` is ``None`` (default).
+ """
+ if not torch.version.hip:
+ try:
+ import pynvml # type: ignore[import]
+ except ModuleNotFoundError:
+ return "pynvml module not found, please install pynvml"
+ from pynvml import NVMLError_DriverNotLoaded
+
+ try:
+ pynvml.nvmlInit()
+ except NVMLError_DriverNotLoaded:
+ return "cuda driver can't be loaded, is cuda enabled?"
+
+ device = _get_nvml_device_index(device)
+ handle = pynvml.nvmlDeviceGetHandleByIndex(device)
+ procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
+ else:
+ try:
+ import amdsmi # type: ignore[import]
+ except ModuleNotFoundError:
+ return "amdsmi module not found, please install amdsmi"
+ try:
+ amdsmi.amdsmi_init() # type: ignore[attr-defined]
+ except amdsmi.AmdSmiException: # type: ignore[attr-defined]
+ return "amdsmi driver can't be loaded, is ROCm installed?"
+
+ device = _get_amdsmi_device_index(device)
+
+ try:
+ handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined]
+ procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined]
+ except amdsmi.AmdSmiException: # type: ignore[attr-defined]
+ return "amdsmi cannot list processes from other users"
+
+ lines = []
+ lines.append(f"GPU:{device}")
+ if len(procs) == 0:
+ lines.append("no processes are running")
+ for p in procs:
+ if not torch.version.hip:
+ mem = p.usedGpuMemory / (1024 * 1024)
+ pid = p.pid
+ else:
+ try:
+ proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined]
+ except AttributeError:
+ # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+ # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+ proc_info = p
+ mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+ pid = proc_info["pid"]
+ lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+ return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+ r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+ Args:
+ device (torch.device or int or str, optional): selected device. Returns
+ statistic for the current device, given by :func:`~torch.cuda.current_device`,
+ if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+ .. note::
+ See :ref:`cuda-memory-management` for more
+ details about GPU memory management.
+ """
+ if device is None:
+ device = torch.cuda.current_device()
+ # optional=True allows `device = torch.device('cuda')` for which device.index is None
+ device = _get_device_index(device, optional=True)
+ return torch.cuda.cudart().cudaMemGetInfo(device)
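+
+
+# Editor's note: illustrative sketch only; `_free_memory_fraction_example` is a
+# hypothetical helper showing how the (free, total) tuple returned by `mem_get_info`
+# can be consumed. It is not part of the upstream module.
+def _free_memory_fraction_example(device: Union[Device, int] = None) -> float:
+ """Return free/total device memory as a float in [0, 1] (illustrative only)."""
+ free, total = mem_get_info(device)
+ return free / total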
+
+
+def _record_memory_history_legacy(
+ enabled: bool,
+ record_context=True,
+ trace_alloc_max_entries=1,
+ trace_alloc_record_context=False,
+ device: Union[Device, int] = None,
+ record_context_cpp=False,
+ clear_history=False,
+):
+ _C._cuda_record_memory_history_legacy(
+ enabled,
+ record_context,
+ trace_alloc_max_entries,
+ trace_alloc_record_context,
+ record_context_cpp,
+ clear_history,
+ )
+
+
+def _record_memory_history(
+ enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+ """Enable recording of stack traces associated with memory
+ allocations, so you can tell what allocated any piece of memory in
+ :func:`torch.cuda.memory._snapshot()`.
+
+ In addition to keeping stack traces with each current allocation and free,
+ this will also enable recording of a history of all alloc/free events.
+
+ Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+ and the tools in `_memory_viz.py` to visualize snapshots.
+
+ The Python trace collection is fast (2us per trace), so you may consider
+ enabling this on production jobs if you anticipate ever having to debug
+ memory issues.
+
+ C++ trace collection is also fast (~50ns/frame), which for many typical programs
+ works out to ~2us per trace, but can vary depending on stack depth.
+
+ Args:
+ enabled (Literal[None, "state", "all"], optional):
+ `None`, disable recording memory history.
+ `"state"`, keep information for currently allocated memory.
+ `"all"`, additionally keep a history of all alloc/free calls.
+ Defaults to "all".
+ context (Literal[None, "state", "alloc", "all"], optional):
+ `None`, Do not record any tracebacks.
+ `"state"`, Record tracebacks for currently allocated memory.
+ `"alloc"`, additionally keep tracebacks for alloc calls.
+ `"all"`, additionally keep tracebacks for free calls.
+ Defaults to "all".
+ stacks (Literal["python", "all"], optional):
+ `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+ `"all"`, additionally include C++ frames
+ Defaults to "all".
+ max_entries (int, optional): Keep a maximum of `max_entries`
+ alloc/free events in the recorded history.
+ """
+ if isinstance(enabled, bool):
+ return _record_memory_history_legacy(enabled, *args, **kwargs)
+ else:
+ return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+ enabled: Optional[str] = "all",
+ context: Optional[str] = "all",
+ stacks: str = "all",
+ max_entries: int = sys.maxsize,
+ device: Union[Device, int] = None,
+ clear_history: bool = False,
+):
+ _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+ """Save a snapshot of CUDA memory state at the time it was called.
+
+ The state is represented as a dictionary with the following structure.
+
+ .. code-block:: python
+
+ class Snapshot(TypedDict):
+ segments : List[Segment]
+ device_traces: List[List[TraceEntry]]
+
+ class Segment(TypedDict):
+ # Segments are memory returned from a cudaMalloc call.
+ # The size of reserved memory is the sum of all Segments.
+ # Segments are cached and reused for future allocations.
+ # If the reuse is smaller than the segment, the segment
+ # is split into more than one Block.
+ # empty_cache() frees Segments that are entirely inactive.
+ address: int
+ total_size: int # cudaMalloc'd size of segment
+ stream: int
+ segment_type: Literal['small', 'large'] # 'large' (>1MB)
+ allocated_size: int # size of memory in use
+ active_size: int # size of memory in use or in active_awaiting_free state
+ blocks : List[Block]
+
+ class Block(TypedDict):
+ # A piece of memory returned from the allocator, or
+ # currently cached but inactive.
+ size: int
+ requested_size: int # size requested during malloc, may be smaller than
+ # size due to rounding
+ address: int
+ state: Literal['active_allocated', # used by a tensor
+ 'active_awaiting_free', # waiting for another stream to finish using
+ # this, then it will become free
+ 'inactive',] # free for reuse
+ frames: List[Frame] # stack trace from where the allocation occurred
+
+ class Frame(TypedDict):
+ filename: str
+ line: int
+ name: str
+
+ class TraceEntry(TypedDict):
+ # When `torch.cuda.memory._record_memory_history()` is enabled,
+ # the snapshot will contain TraceEntry objects that record each
+ # action the allocator took.
+ action: Literal[
+ 'alloc' # memory allocated
+ 'free_requested', # the allocation received a call to free memory
+ 'free_completed', # the memory that was requested to be freed is now
+ # able to be used in future allocation calls
+ 'segment_alloc', # the caching allocator asked cudaMalloc for more memory
+ # and added it as a segment in its cache
+ 'segment_free', # the caching allocator called cudaFree to return memory
+ # to cuda, possibly trying to free up memory to
+ # allocate more segments or because empty_cache was called
+ 'oom', # the allocator threw an OOM exception. 'size' is
+ # the requested number of bytes that did not succeed
+ 'snapshot' # the allocator generated a memory snapshot
+ # useful to correlate a previously taken
+ # snapshot with this trace
+ ]
+ addr: int # not present for OOM
+ frames: List[Frame]
+ size: int
+ stream: int
+ device_free: int # only present for OOM, the amount of
+ # memory cuda still reports to be free
+
+ Returns:
+ The Snapshot dictionary object
+ """
+ return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+ """
+ Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+ This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+ Args:
+ filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+ """
+ s = _snapshot()
+ with open(filename, "wb") as f:
+ pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+ if snapshot is None:
+ snapshot = _snapshot()
+ with open(filename, "w") as f:
+ f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+ if snapshot is None:
+ snapshot = _snapshot()
+ with open(filename, "w") as f:
+ f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+ return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+ r"""Return a string describing the active allocator backend as set by
+ ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+ ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+ (CUDA's built-in asynchronous allocator).
+
+ .. note::
+ See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+ """
+ return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+ r"""Wrapper over internal CUDA memory allocators."""
+
+ def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+ self._allocator = allocator
+
+ def allocator(self):
+ return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+ r"""CUDA memory allocator loaded from a .so file."""
+
+ def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+ r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+ To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+ Args:
+ path_to_so_file(str): Path in the filesystem to the `.so` file containing
+ the allocator functions
+ alloc_fn_name(str): Name of the function to perform the memory allocation
+ in the so file. The signature must be:
+ void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+ free_fn_name(str): Name of the function to perform the memory release
+ in the so file. The signature must be:
+ void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+ .. warning::
+ This is currently supported only on UNIX OSs
+
+ .. note::
+ See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+ """
+ allocator = ctypes.CDLL(path_to_so_file)
+ alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+ free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+ assert alloc_fn is not None
+ assert free_fn is not None
+ self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
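+
+
+# Editor's note: illustrative sketch of wiring up a pluggable allocator. The shared
+# library path "./my_alloc.so" and the exported symbols `my_malloc`/`my_free` are
+# hypothetical; they must export the signatures documented above. This helper is not
+# part of the upstream module and is not executed on import.
+def _use_pluggable_allocator_example() -> None:
+ """Swap in a custom allocator before any CUDA allocation has happened (illustrative only)."""
+ custom = CUDAPluggableAllocator("./my_alloc.so", "my_malloc", "my_free")
+ # Changing the allocator is only allowed while the default allocator is still unused.
+ change_current_allocator(custom)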
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+ r"""Change the currently used memory allocator to be the one provided.
+
+ If the current allocator has already been used/initialized, this function will error.
+
+ Args:
+ allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+ .. note::
+ See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+ """
+ torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+ r"""Return the allocator being currently used.
+
+ .. note::
+ See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+ """
+ return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+ r"""MemPoolContext holds the currently active pool and stashes the previous
+ pool. On deletion it makes the previous pool active.
+
+ Args:
+ pool(torch.cuda.MemPool): a MemPool object to be made active so that
+ allocations route to this pool.
+
+ """
+
+ def __init__(self, pool: _MemPool):
+ super().__init__(pool)
+
+ @staticmethod
+ def active_pool() -> Optional[_MemPool]:
+ r"""Returns the active MemPool."""
+ return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+ r"""MemPool represents a pool of memory in a caching allocator. Currently,
+ it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+ Args:
+ allocator(torch._C._cuda_CUDAAllocator, optional): a
+ torch._C._cuda_CUDAAllocator object that can be used to
+ define how memory gets allocated in the pool. If :attr:`allocator`
+ is ``None`` (default), memory allocation follows the default/
+ current configuration of the CUDACachingAllocator.
+
+ """
+
+ def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+ super().__init__(allocator, True)
+
+ @property
+ def id(self) -> tuple[int, int]:
+ r"""Returns the ID of this pool as a tuple of two ints."""
+ return super().id
+
+ @property
+ def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+ r"""Returns the allocator this MemPool routes allocations to."""
+ return super().allocator
+
+ def use_count(self) -> int:
+ r"""Returns the reference count of this pool."""
+ return super().use_count()
+
+ def snapshot(self):
+ r"""Return a snapshot of the CUDA memory allocator pool state across all
+ devices.
+
+ Interpreting the output of this function requires familiarity with the
+ memory allocator internals.
+
+ .. note::
+ See :ref:`cuda-memory-management` for more details about GPU memory
+ management.
+ """
+ try:
+ ctx = MemPoolContext(self)
+ snapshot = torch.cuda.memory_snapshot()
+ finally:
+ del ctx
+ return snapshot
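+
+
+# Editor's note: illustrative sketch of inspecting a MemPool; it assumes an initialized
+# CUDA context and the hypothetical helper name `_inspect_mem_pool_example` is not part
+# of the upstream module.
+def _inspect_mem_pool_example() -> None:
+ """Create a pool and print its ID and reference count (illustrative only)."""
+ pool = MemPool()
+ print(f"pool id={pool.id}, use_count={pool.use_count()}")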
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+ r"""A context manager that routes allocations to a given pool.
+
+ Args:
+ pool(torch.cuda.MemPool): a MemPool object to be made active so that
+ allocations route to this pool.
+ device (torch.device or int, optional): selected device. Uses MemPool on
+ the current device, given by :func:`~torch.cuda.current_device`,
+ if :attr:`device` is ``None`` (default).
+
+ """
+ ctx = MemPoolContext(pool)
+ device_index = (
+ torch.cuda.current_device() if device is None else _get_device_index(device)
+ )
+ _cuda_beginAllocateToPool(device_index, pool.id)
+ try:
+ yield
+ finally:
+ _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+ _cuda_releasePool(device_index, pool.id)
+ del ctx
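+
+
+# Editor's note: illustrative sketch of routing allocations through `use_mem_pool`;
+# it assumes a CUDA device is available and the hypothetical helper name
+# `_use_mem_pool_example` is not part of the upstream module.
+def _use_mem_pool_example() -> None:
+ """Allocate a tensor inside a pool-scoped context (illustrative only)."""
+ pool = MemPool()
+ with use_mem_pool(pool):
+ # Allocations made here are routed to `pool` rather than the default pool.
+ buf = torch.empty(1024, device="cuda")
+ del buf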
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
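+
+    Example (an illustrative usage sketch; it assumes CUDA is available and
+    that the current device has already served some allocations)::
+
+        >>> print(torch.cuda.memory_summary(abbreviated=True))  # doctest: +SKIP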
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
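+
+    Example (an illustrative usage sketch; it assumes at least one visible,
+    initialized CUDA device)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB")  # doctest: +SKIP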
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly to free up memory to
+                             # allocate more segments or because empty_cache() was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot',  # the allocator generated a memory snapshot
+                         # useful to correlate a previously taken
+                         # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently only supported on Unix-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
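+
+    Example (an illustrative usage sketch; it assumes this module's public
+    names are re-exported as ``torch.cuda.MemPool`` and
+    ``torch.cuda.use_mem_pool``)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # allocation routed to the pool
+        >>> count = pool.use_count()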
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
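+
+
+# ---------------------------------------------------------------------------
+# Illustrative usage sketch. ``_example_peak_memory_and_history`` is a
+# hypothetical helper added for documentation purposes only; it is not part of
+# the upstream torch.cuda.memory API. It ties together the alloc/free-history
+# workflow documented in ``_record_memory_history`` / ``_dump_snapshot`` above
+# with the per-iteration peak-memory pattern described in
+# ``max_memory_allocated``, and assumes a working CUDA device.
+# ---------------------------------------------------------------------------
+def _example_peak_memory_and_history() -> None:
+    # Start recording alloc/free events with Python stack traces.
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    for step in range(3):
+        # Reset the peak counter so the value printed below covers this
+        # iteration only.
+        reset_peak_memory_stats()
+        x = torch.empty(256, 256, device="cuda")
+        y = x * 2.0
+        del x, y
+        print(f"step {step}: peak allocated = {max_memory_allocated()} bytes")
+    # Persist the recorded history; the file can be opened with the viewer at
+    # pytorch.org/memory_viz.
+    _dump_snapshot("dump_snapshot.pickle")
+    # Stop recording.
+    _record_memory_history(enabled=None)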
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
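+
+
+def _example_host_memory_stats() -> None:
+    # Illustrative usage sketch. ``_example_host_memory_stats`` is a
+    # hypothetical helper, not part of the upstream API; it reads a few of the
+    # pinned-host-memory counters documented in ``host_memory_stats`` above
+    # and assumes CUDA has been initialized.
+    t = torch.empty(1024, pin_memory=True)  # triggers a cudaHostAlloc
+    stats = host_memory_stats()
+    for key in ("num_host_alloc", "num_host_free", "host_alloc_time.avg"):
+        print(f"{key}: {stats.get(key, 0)}")
+    del t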
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
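+
+    Example (an illustrative sketch; the tensor shape and the use of the default
+    CUDA device are arbitrary)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     # allocations made inside the block are routed to ``pool``
+        ...     x = torch.empty(1024, device="cuda")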
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
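+
+
+# ---------------------------------------------------------------------------
+# Illustrative usage sketch (not part of the upstream torch.cuda.memory API):
+# it combines the statistics, history and snapshot helpers defined above
+# around an arbitrary workload. The helper name `_example_report_memory_usage`
+# and the default snapshot filename are hypothetical placeholders.
+def _example_report_memory_usage(workload, snapshot_path="memory_example.pickle"):
+    """Run ``workload()`` and report peak CUDA memory usage for the current device.
+
+    ``workload`` is any zero-argument callable that allocates CUDA memory.
+    """
+    _record_memory_history(enabled="all")  # keep stack traces for every alloc/free
+    reset_peak_memory_stats()  # start peak tracking from a clean slate
+    try:
+        workload()
+    finally:
+        print(memory_summary(abbreviated=True))
+        print("peak allocated bytes:", max_memory_allocated())
+        _dump_snapshot(snapshot_path)  # open at pytorch.org/memory_viz
+        _record_memory_history(enabled=None)  # stop recording alloc/free history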
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
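+
+    Example (an illustrative sketch, assuming a CUDA device is available)::
+
+        pool = MemPool()                 # pool backed by the current allocator config
+        with use_mem_pool(pool):
+            buf = torch.empty(1024, device="cuda")  # this allocation routes to ``pool``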
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
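+
+
+# Illustrative sketch (not part of the upstream module): measure the peak tensor
+# memory of one unit of work with ``reset_peak_memory_stats`` and
+# ``max_memory_allocated``, the pattern suggested in their docstrings above.
+# ``step_fn`` is a placeholder for any callable that does CUDA work.
+def _example_peak_memory_bytes(step_fn) -> int:
+    """Run ``step_fn`` once and return the peak memory it allocated, in bytes."""
+    reset_peak_memory_stats()
+    step_fn()
+    return max_memory_allocated()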
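+
+
+# Illustrative sketch (not part of the upstream module): the snapshot workflow
+# described in ``_record_memory_history`` and ``_dump_snapshot``: record
+# alloc/free history, run a workload, then dump a pickle that can be opened at
+# pytorch.org/memory_viz. ``workload_fn`` is a placeholder callable.
+def _example_dump_history(workload_fn, filename="memory_snapshot.pickle"):
+    """Record allocation history while ``workload_fn`` runs and dump it to ``filename``."""
+    _record_memory_history(enabled="all")
+    try:
+        workload_fn()
+    finally:
+        _dump_snapshot(filename)
+        _record_memory_history(enabled=None)  # stop recording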
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream, this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream.
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
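+
+    Example (an illustrative sketch; ``big_tensor`` is a placeholder name)::
+
+        del big_tensor            # drop the last reference so its blocks become inactive
+        torch.cuda.empty_cache()  # return the now-unused cached blocks to the driver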
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes the amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot,
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
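+
+
+# Usage sketch: a minimal, illustrative walk-through of the memory-introspection
+# APIs defined above (memory_stats/memory_summary for counters, and the
+# _record_memory_history -> _dump_snapshot workflow for the snapshot viewer at
+# pytorch.org/memory_viz). This is a sketch, not part of the module's API:
+# it assumes a CUDA-capable build with at least one visible device, and the
+# helper name and snapshot file name below are illustrative only.
+def _example_memory_introspection() -> None:
+    if not torch.cuda.is_available():
+        return  # nothing to inspect without a CUDA device
+    # Start recording alloc/free history with stack traces.
+    _record_memory_history(enabled="all", context="all", stacks="all")
+    # Run some workload that allocates GPU memory.
+    x = torch.randn(1024, 1024, device="cuda")
+    y = x @ x
+    del x, y
+    # Aggregate counters: peak allocated bytes and a human-readable table.
+    print(f"peak allocated: {max_memory_allocated()} bytes")
+    print(memory_summary(abbreviated=True))
+    # Persist a snapshot that can be opened in the interactive viewer.
+    _dump_snapshot("my_run_snapshot.pickle")
+    # Stop recording history.
+    _record_memory_history(enabled=None)
+# Call _example_memory_introspection() manually to try the workflow end to end.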
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
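A minimal usage sketch for the pool-routing API defined above, assuming a CUDA-enabled build at runtime and that these names are importable from torch.cuda.memory, the module this file mirrors; only names introduced in this file plus torch.empty are used:

    import torch
    from torch.cuda.memory import MemPool, use_mem_pool

    # Build a pool that follows the current CUDACachingAllocator configuration
    # (allocator=None, the default) and route allocations into it.
    pool = MemPool()
    with use_mem_pool(pool):
        x = torch.empty(1024, device="cuda")  # this allocation lands in `pool`

    print(pool.use_count())               # reference count held on the pool
    print(torch.cuda.memory_allocated())  # bytes currently occupied by tensors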
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
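+
+
+# Illustrative sketch only: routes a few allocations into a dedicated pool via
+# the use_mem_pool() context manager defined above. The helper name
+# `_example_use_mem_pool` is a placeholder.
+def _example_use_mem_pool() -> None:
+    """Allocate a tensor out of a dedicated MemPool."""
+    pool = MemPool()  # pool backed by the current CUDACachingAllocator config
+    with use_mem_pool(pool):
+        y = torch.ones(8, device="cuda")  # served from `pool`
+    del y  # allocations made outside the context go to the default pool again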
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and adds it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
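+
+
+# ---------------------------------------------------------------------------
+# Editor's note: the commented sketch below is not part of the original
+# module.  It is a minimal illustration of how the statistics and snapshot
+# APIs documented above are typically driven, assuming a CUDA-enabled build
+# of PyTorch; the tensor shape and the "snapshot.pickle" filename are
+# arbitrary placeholders.
+#
+#     import torch
+#
+#     if torch.cuda.is_available():
+#         # Record alloc/free history so snapshots carry stack traces.
+#         torch.cuda.memory._record_memory_history()
+#
+#         x = torch.randn(1024, 1024, device="cuda")  # goes through the caching allocator
+#
+#         # Flat, sorted counters described in memory_stats()'s docstring.
+#         stats = torch.cuda.memory_stats()
+#         print(stats["allocated_bytes.all.current"], torch.cuda.memory_allocated())
+#
+#         # Human-readable table built from the same counters.
+#         print(torch.cuda.memory_summary(abbreviated=True))
+#
+#         # Persist a snapshot that can be loaded at pytorch.org/memory_viz.
+#         torch.cuda.memory._dump_snapshot("snapshot.pickle")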
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
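+
+    Example (an illustrative sketch, not part of the original docstring; it
+    assumes a CUDA-capable build where ``torch.cuda.MemPool`` and
+    ``torch.cuda.use_mem_pool`` are exported, and at least one visible device)::
+
+        >>> pool = torch.cuda.MemPool()  # pool backed by the current allocator config
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     t = torch.empty(1024, device="cuda")  # routed to ``pool``
+        >>> snap = pool.snapshot()  # allocator state while ``pool`` is active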
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
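+
+
+# ---------------------------------------------------------------------------
+# Editor's note: the block below is an illustrative usage sketch, not part of
+# the upstream module. It exercises a few of the public helpers documented
+# above under the assumption that a CUDA-capable build and at least one
+# visible device are present; the tensor shapes and the snapshot file name
+# are arbitrary placeholders.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        # Free/total device memory as reported by cudaMemGetInfo.
+        free_bytes, total_bytes = mem_get_info()
+        print(f"device memory: {free_bytes} bytes free of {total_bytes}")
+
+        # Record allocation history, allocate and release a tensor, then dump
+        # a snapshot that can be inspected at pytorch.org/memory_viz.
+        _record_memory_history(enabled="all")
+        scratch = torch.empty(1024, 1024, device="cuda")
+        del scratch
+        _dump_snapshot("example_snapshot.pickle")
+
+        # Flattened allocator statistics and a human-readable summary.
+        print(memory_stats().get("allocated_bytes.all.current", 0))
+        print(memory_summary(abbreviated=True))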
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
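+
+    Example (illustrative sketch; the printed table varies per run and device)::
+
+        >>> # print a compact, human-readable allocator report for device 0
+        >>> print(torch.cuda.memory_summary(device=0, abbreviated=True))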
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
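+
+    Example (illustrative sketch; byte counts are device-dependent)::
+
+        >>> # free/total are raw byte counts reported by cudaMemGetInfo
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> used_fraction = 1.0 - free_bytes / total_bytes  # derived value, for illustration only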
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc'  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the ``.so`` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
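+
+    Example (illustrative sketch; assumes these helpers are exposed as
+    ``torch.cuda.MemPool`` and ``torch.cuda.use_mem_pool`` and a CUDA device is available)::
+
+        >>> pool = torch.cuda.MemPool()  # pool backed by the current allocator configuration
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.ones(4, device="cuda")  # this allocation is routed to the pool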
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
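For reviewers, below is a minimal usage sketch of the APIs this file exposes. It is illustrative only and not part of the diff; it assumes a CUDA-enabled PyTorch build in which these functions are reachable through their usual torch.cuda / torch.cuda.memory entry points (the names listed in __all__ above).

    import torch

    if torch.cuda.is_available():
        # Record allocation history so snapshots carry Python stack traces.
        torch.cuda.memory._record_memory_history(enabled="all", context="all", stacks="python")

        x = torch.randn(1024, 1024, device="cuda")  # allocations go through the caching allocator
        y = x @ x

        print(torch.cuda.memory_allocated())              # bytes currently occupied by tensors
        print(torch.cuda.memory_summary(abbreviated=True))

        # Persist a snapshot that the pytorch.org/memory_viz viewer can open.
        torch.cuda.memory._dump_snapshot("memory_snapshot.pickle")
        torch.cuda.memory._record_memory_history(enabled=None)  # stop recording

        # Route allocations made inside the context to a dedicated pool.
        pool = torch.cuda.memory.MemPool()
        with torch.cuda.memory.use_mem_pool(pool):
            z = torch.empty(256, 256, device="cuda")

        del x, y, z
        torch.cuda.empty_cache()  # release unused cached segments back to the driver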
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
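+
+    Example (a minimal sketch; the exact figures depend on the device and on
+    prior allocations)::
+
+        >>> _ = torch.empty(1024, 1024, device="cuda")  # assumes a CUDA device is available
+        >>> print(torch.cuda.memory_summary(abbreviated=True))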
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
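+
+    Example (a minimal sketch; the reported numbers depend on the GPU and on
+    other processes sharing it)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()  # assumes a CUDA device is available
+        >>> free_bytes <= total_bytes
+        True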
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
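+
+    Example (a minimal sketch; assumes a CUDA device is available and that the
+    pool is routed to via :func:`~torch.cuda.use_mem_pool`)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     tmp = torch.empty(1024, device="cuda")  # allocation routed to this pool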
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
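+
+    A minimal sketch (assumes a CUDA device is available and that these names
+    are exported as ``torch.cuda.MemPool`` / ``torch.cuda.use_mem_pool``)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # allocations made inside this block are routed to ``pool``
+            x = torch.empty(1024, device="cuda")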
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
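+
+
+if __name__ == "__main__":
+    # Illustrative usage sketch (not part of the API above): record an
+    # allocation history, run a tiny workload, and dump a snapshot that can be
+    # inspected at pytorch.org/memory_viz. Assumes a CUDA device is available;
+    # the output filename is arbitrary.
+    _record_memory_history(max_entries=100000)
+    x = torch.empty(1024, 1024, device="cuda")  # sample workload to profile
+    del x
+    _dump_snapshot("workload_snapshot.pickle")
+    _record_memory_history(enabled=None)  # stop recording further events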
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
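+
+ A minimal sketch of creating and inspecting a pool (assuming ``MemPool`` is
+ re-exported as ``torch.cuda.MemPool``, as listed in ``__all__`` above)::
+
+     >>> pool = torch.cuda.MemPool()
+     >>> pool_id = pool.id          # (group id, pool id) tuple of two ints
+     >>> refs = pool.use_count()    # reference count currently held on the pool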
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
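+
+ Example (an illustrative sketch; the returned value is a raw device pointer and
+ must eventually be released with :func:`~torch.cuda.caching_allocator_delete`)::
+
+     >>> ptr = torch.cuda.caching_allocator_alloc(1024)
+     >>> # ... hand ``ptr`` to another framework ...
+     >>> torch.cuda.caching_allocator_delete(ptr)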
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
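+
+ A sketch of pulling a single statistic out of the flattened dictionary (the key
+ names follow the conventions documented below)::
+
+     >>> stats = torch.cuda.memory_stats()
+     >>> peak_allocated = stats.get("allocated_bytes.all.peak", 0)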
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
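+
+ For example, under these conventions ``"allocated_bytes.peak"`` is the high-water
+ mark of pinned host memory in bytes; a sketch of reading it::
+
+     >>> host_stats = torch.cuda.host_memory_stats()
+     >>> peak_pinned = host_stats.get("allocated_bytes.peak", 0)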
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
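+
+ A sketch of the per-iteration measurement described above (``loader`` and
+ ``train_step`` are placeholders for user code)::
+
+     >>> for batch in loader:
+     ...     torch.cuda.reset_peak_memory_stats()
+     ...     train_step(batch)
+     ...     peak = torch.cuda.max_memory_reserved()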
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
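+
+ Example (an illustrative sketch; ``alloc.so``, ``my_alloc`` and ``my_free`` are
+ placeholder names for a user-provided allocator library)::
+
+     >>> new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+     ...     "alloc.so", "my_alloc", "my_free")
+     >>> torch.cuda.memory.change_current_allocator(new_alloc)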
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
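+
+ Example (a minimal sketch; tensors created inside the block draw their memory
+ from ``pool``)::
+
+     >>> pool = torch.cuda.MemPool()
+     >>> with torch.cuda.use_mem_pool(pool):
+     ...     buf = torch.empty(2**20, device="cuda", dtype=torch.uint8)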
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
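+
+ Example (illustrative; cached blocks are only releasable once no live tensor
+ uses them)::
+
+     >>> x = torch.empty(1024, 1024, device="cuda")
+     >>> del x
+     >>> torch.cuda.empty_cache()  # reserved memory shrinks, visible in nvidia-smi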
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from an ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
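The helpers in this patch mirror the public ``torch.cuda`` memory API, so the short sketches below drive them through their upstream ``torch.cuda`` entry points; they assume a CUDA-enabled PyTorch build with at least one visible GPU, and every size and name used is illustrative rather than part of the patch. This first sketch contrasts memory backing live tensors (``memory_allocated``) with memory cached by the allocator (``memory_reserved``), and shows what ``empty_cache()`` actually releases:

.. code-block:: python

    import torch

    if torch.cuda.is_available():
        x = torch.empty(1024, 1024, device="cuda")  # ~4 MiB of float32
        print("allocated:", torch.cuda.memory_allocated())  # bytes held by live tensors
        print("reserved: ", torch.cuda.memory_reserved())   # bytes held by the caching allocator
        del x
        # The freed block stays cached; empty_cache() returns it to the driver
        # so other applications (and nvidia-smi) can see it as free again.
        torch.cuda.empty_cache()
        print("reserved after empty_cache():", torch.cuda.memory_reserved())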
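The ``max_memory_allocated`` docstring suggests pairing it with ``reset_peak_memory_stats`` to measure the peak of each training iteration. A minimal sketch of that pattern (the tiny linear model and batch shape are placeholders):

.. code-block:: python

    import torch

    if torch.cuda.is_available():
        model = torch.nn.Linear(512, 512).cuda()
        for step in range(3):
            torch.cuda.reset_peak_memory_stats()  # restart peak tracking for this step
            out = model(torch.randn(64, 512, device="cuda"))
            out.sum().backward()
            peak_mib = torch.cuda.max_memory_allocated() / 2**20
            print(f"step {step}: peak allocated {peak_mib:.1f} MiB")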
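``memory_stats`` flattens the allocator counters into dot-joined keys such as ``"allocated_bytes.all.peak"``, while ``memory_stats_as_nested_dict`` returns the same data as nested dictionaries. A small sketch, assuming the allocator has been initialized by at least one allocation:

.. code-block:: python

    import torch

    if torch.cuda.is_available():
        _ = torch.empty(1024, device="cuda")  # force allocator initialization
        flat = torch.cuda.memory_stats()
        print(flat["allocated_bytes.all.peak"], flat["num_alloc_retries"])
        nested = torch.cuda.memory_stats_as_nested_dict()
        print(nested["allocated_bytes"]["all"]["peak"])  # same counter, nested form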
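``set_per_process_memory_fraction`` caps how much of the device's total memory this process may allocate; requests beyond the cap raise the usual out-of-memory error. A sketch of setting the cap and observing the failure (the 0.5 fraction and the oversized request are arbitrary choices):

.. code-block:: python

    import torch

    if torch.cuda.is_available():
        torch.cuda.set_per_process_memory_fraction(0.5)
        print(torch.cuda.get_per_process_memory_fraction())  # -> 0.5
        total = torch.cuda.get_device_properties(0).total_memory
        try:
            # Asking for ~75% of the device exceeds the 50% cap.
            huge = torch.empty(int(total * 0.75), dtype=torch.uint8, device="cuda")
        except torch.cuda.OutOfMemoryError:
            print("allocation beyond the configured fraction was refused")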
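For quick diagnostics, ``mem_get_info`` reports device-wide free/total bytes via ``cudaMemGetInfo``, ``memory_summary`` renders the allocator counters as a table, and ``list_gpu_processes`` lists processes using the device (it needs ``pynvml``, or ``amdsmi`` on ROCm). A sketch:

.. code-block:: python

    import torch

    if torch.cuda.is_available():
        free, total = torch.cuda.mem_get_info()
        print(f"device 0: {free / 2**30:.2f} GiB free of {total / 2**30:.2f} GiB")
        print(torch.cuda.memory_summary(abbreviated=True))
        print(torch.cuda.list_gpu_processes())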
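``_record_memory_history`` and ``_dump_snapshot`` are private helpers whose signatures may change between releases, but together they produce a snapshot file that the viewer at pytorch.org/memory_viz can open. A hedged sketch; the file name and the throwaway allocations are placeholders:

.. code-block:: python

    import torch

    if torch.cuda.is_available():
        torch.cuda.memory._record_memory_history(max_entries=100_000)
        buffers = [torch.randn(1024, 1024, device="cuda") for _ in range(4)]
        del buffers
        torch.cuda.memory._dump_snapshot("example_snapshot.pickle")  # placeholder file name
        torch.cuda.memory._record_memory_history(enabled=None)  # stop recording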
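``use_mem_pool`` routes every allocation made inside the ``with`` block to the given ``MemPool`` and detaches the pool from the current stream on exit. A minimal sketch using a pool backed by the default allocator configuration:

.. code-block:: python

    import torch
    from torch.cuda.memory import MemPool, use_mem_pool

    if torch.cuda.is_available():
        pool = MemPool()  # follows the current CUDACachingAllocator configuration
        with use_mem_pool(pool):
            scratch = torch.empty(1024, 1024, device="cuda")  # allocated inside the pool
        print(pool.id)           # a tuple of two ints identifying the pool
        print(pool.use_count())  # reference count held on the pool
        del scratch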
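``caching_allocator_alloc`` and ``caching_allocator_delete`` expose raw device pointers from the caching allocator, mainly for interoperability with other frameworks. A sketch of the allocate/release round trip (the 1 KiB size is arbitrary):

.. code-block:: python

    import torch

    if torch.cuda.is_available():
        torch.cuda.init()  # make sure the CUDA context and allocator exist
        ptr = torch.cuda.caching_allocator_alloc(1024)  # raw device pointer on the current stream
        # ... hand `ptr` to a library that expects a plain device address ...
        torch.cuda.caching_allocator_delete(ptr)  # the device and stream are tracked internally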
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
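+
+
+# ---------------------------------------------------------------------------
+# Editor's sketch (not part of the upstream module): one way the MemPool /
+# use_mem_pool APIs defined above could be exercised.  The helper name
+# `_demo_mem_pool` and the tensor shape are arbitrary assumptions.
+def _demo_mem_pool() -> None:
+    # Route allocations made inside the context manager to a dedicated pool
+    # backed by the default CUDACachingAllocator configuration.
+    pool = MemPool()
+    with use_mem_pool(pool):
+        scratch = torch.empty(256, 256, device="cuda")  # served from `pool`
+    # Inspect the pool afterwards: its ID, reference count, and a pool-scoped
+    # snapshot of the allocator state.
+    print("pool id:", pool.id, "use_count:", pool.use_count())
+    print("segments in pool snapshot:", len(pool.snapshot()))
+    del scratch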
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
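+
+
+# Editor's sketch (not part of the upstream module): a debugging workflow that
+# ties together the statistics, summary, and snapshot helpers documented
+# above.  The workload and the file name "memory_snapshot.pickle" are
+# arbitrary assumptions.
+def _demo_memory_debugging() -> None:
+    _record_memory_history(enabled="all")  # record alloc/free history + stack traces
+    workload = [torch.empty(1024, 1024, device="cuda") for _ in range(4)]
+    print(memory_summary(abbreviated=True))  # human-readable counter table
+    stats = memory_stats()
+    print("peak allocated bytes:", stats.get("allocated_bytes.all.peak", 0))
+    _dump_snapshot("memory_snapshot.pickle")  # open with pytorch.org/memory_viz
+    del workload
+    empty_cache()
+    reset_peak_memory_stats()
+    _record_memory_history(enabled=None)  # stop recording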
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
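As a usage sketch of the per-iteration pattern mentioned above (reset the peak, run one iteration, read it back), assuming a CUDA device and a stand-in workload:

.. code-block:: python

    import torch

    for step in range(3):
        torch.cuda.reset_peak_memory_stats()
        x = torch.randn(1024, 1024, device="cuda")  # stand-in for a training step
        y = x @ x
        del x, y
        peak = torch.cuda.max_memory_allocated()
        print(f"step {step}: peak allocated {peak / 2**20:.1f} MiB")

    # Human-readable rollup of the same allocator statistics.
    print(torch.cuda.memory_summary(abbreviated=True))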
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
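One common way to use these printouts, as suggested above, is inside an out-of-memory handler; a sketch (``torch.cuda.OutOfMemoryError`` is assumed to be available, as in recent PyTorch releases):

.. code-block:: python

    import torch

    def alloc_or_report(nbytes: int) -> torch.Tensor:
        try:
            return torch.empty(nbytes, dtype=torch.uint8, device="cuda")
        except torch.cuda.OutOfMemoryError:
            # Dump allocator state and per-process usage before re-raising.
            print(torch.cuda.memory_summary(abbreviated=True))
            print(torch.cuda.list_gpu_processes())
            raise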
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
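A small usage sketch for the call above; the returned numbers are device-wide, not per-process:

.. code-block:: python

    import torch

    free, total = torch.cuda.mem_get_info()  # both in bytes
    print(f"{(total - free) / 2**30:.2f} GiB in use of {total / 2**30:.2f} GiB")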
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc'  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
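To make the pluggable-allocator flow above concrete, a hedged sketch; ``./myalloc.so``, ``my_malloc`` and ``my_free`` are placeholder names for a user-built library exporting the signatures documented above:

.. code-block:: python

    import torch

    # Placeholder path and symbol names; the .so must export the documented signatures.
    new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
        "./myalloc.so", "my_malloc", "my_free"
    )

    # Must be called before the current allocator has been used/initialized.
    torch.cuda.memory.change_current_allocator(new_alloc)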
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
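
For reference, a minimal sketch of how the snapshot, MemPool, and pluggable-allocator APIs defined in this file are typically driven. This is an illustration added alongside the patch, not part of the diff itself; it assumes a recent CUDA-capable PyTorch build in which these private helpers and the experimental MemPool API are available, and the shared-library and function names passed to CUDAPluggableAllocator are placeholders.

    import torch
    from torch.cuda.memory import _dump_snapshot, _record_memory_history

    if torch.cuda.is_available():
        # Record stack traces for every alloc/free event handled by the caching allocator.
        _record_memory_history(enabled="all", stacks="python")

        x = torch.empty(1024, 1024, device="cuda")  # served by the caching allocator
        del x
        torch.cuda.empty_cache()

        # Write the snapshot to disk; open it with the viewer at pytorch.org/memory_viz.
        _dump_snapshot("snapshot.pickle")
        _record_memory_history(enabled=None)  # stop recording

        # Route allocations made inside the block to a dedicated pool.
        pool = torch.cuda.MemPool()
        with torch.cuda.use_mem_pool(pool):
            y = torch.ones(256, 256, device="cuda")

        # Swapping in a custom allocator compiled into a shared library
        # ("alloc.so", "my_alloc", "my_free" are placeholder names) must happen
        # before the first CUDA allocation, so it is left commented out here:
        # new_alloc = torch.cuda.memory.CUDAPluggableAllocator("alloc.so", "my_alloc", "my_free")
        # torch.cuda.memory.change_current_allocator(new_alloc)

The pluggable-allocator lines are commented out because, per the docstring above, change_current_allocator raises if the current allocator has already been used or initialized.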
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
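+
+    A rough sketch of that per-iteration pattern (``loader`` and ``train_step``
+    are hypothetical stand-ins for the user's own data loader and step
+    function)::
+
+        for batch in loader:
+            torch.cuda.reset_peak_memory_stats()
+            train_step(batch)
+            peak_reserved = torch.cuda.max_memory_reserved()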
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
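+
+    A minimal usage sketch (the ``torch.randn`` allocation is purely
+    illustrative)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            buf = torch.randn(1024, device="cuda")  # allocation routed to ``pool``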
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocated block received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
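+
+    A minimal usage sketch (assuming a CUDA device is available and that this
+    module is importable as ``torch.cuda.memory``, as in upstream PyTorch)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.memory.use_mem_pool(pool):
+            buf = torch.empty(1024, device="cuda")  # allocation is routed to ``pool``
+        print(pool.use_count())  # reference count held on the pool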
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
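+
+
+# A minimal end-to-end sketch (illustrative only) of the allocation-history workflow
+# documented above. It assumes a CUDA device is present and that this module is
+# importable as ``torch.cuda.memory`` (as in upstream PyTorch); ``run_workload()``
+# is a hypothetical stand-in for an arbitrary training or inference step.
+#
+#     torch.cuda.memory._record_memory_history(max_entries=100_000)  # start recording
+#     run_workload()                                                 # hypothetical workload
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")            # save for the viewer
+#     torch.cuda.memory._record_memory_history(enabled=None)         # stop recording
+#
+# The resulting "snapshot.pickle" can then be opened with the interactive viewer
+# referenced in ``_dump_snapshot`` (pytorch.org/memory_viz).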
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
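+
+    Example (illustrative only; the exact figures depend on the device and on
+    the workload that has run so far)::
+
+        >>> print(torch.cuda.memory_summary())
+        >>> print(torch.cuda.memory_summary(abbreviated=True))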
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
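+
+    Example (a rough sketch; the returned values vary with driver state and
+    with other processes using the device)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> used_fraction = 1 - free_bytes / total_bytes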
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed can now
+                                   # be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly to free up memory for more segments
+                                 # or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot,
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
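+
+    Example (an illustrative sketch; assumes a CUDA device is available and
+    that allocations are routed into the pool via :func:`~torch.cuda.use_mem_pool`,
+    defined below)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # allocation routed to ``pool``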
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
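+
+
+# Illustrative workflow sketch (editorial example, not part of the original
+# module): record allocation history, run a workload, then dump a snapshot
+# that can be opened at pytorch.org/memory_viz. Assumes a CUDA device;
+# ``run_workload`` is a hypothetical placeholder.
+#
+#     torch.cuda.memory._record_memory_history(max_entries=100_000)
+#     run_workload()
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")
+#     torch.cuda.memory._record_memory_history(enabled=None)  # stop recording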
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+    device = _get_amdsmi_device_index(device)
+
+    try:
+        handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+        procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+    except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+        return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes the amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',      # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']              # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',           # memory allocated
+                'free_requested',  # the allocation received a call to free its memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',   # the caching allocator asked cudaMalloc for more memory
+                                   # and added it as a segment in its cache
+                'segment_free',    # the caching allocator called cudaFree to return memory
+                                   # to CUDA, possibly trying to free up memory to
+                                   # allocate more segments or because empty_cache() was called
+                'oom',             # the allocator threw an OOM exception. 'size' is
+                                   # the requested number of bytes that did not succeed
+                'snapshot'         # the allocator generated a memory snapshot,
+                                   # useful to correlate a previously taken
+                                   # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory CUDA still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function that performs the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function that performs the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
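A minimal usage sketch of the allocator-inspection and pooling APIs added above, assuming a CUDA-enabled PyTorch build where this module is importable at its upstream location (``torch.cuda`` / ``torch.cuda.memory``) and where ``MemPool``/``use_mem_pool`` are re-exported at the ``torch.cuda`` level as listed in the module's ``__all__``; the snapshot file name and tensor shapes are illustrative only.

.. code-block:: python

    import torch

    if torch.cuda.is_available():
        # Keep stack traces plus an alloc/free history so snapshots are annotated.
        torch.cuda.memory._record_memory_history(enabled="all", context="all", stacks="python")

        x = torch.randn(1024, 1024, device="cuda")
        y = x @ x  # some work that allocates GPU memory

        # Flat statistics dict and the human-readable table built from it.
        stats = torch.cuda.memory_stats()
        print("peak allocated bytes:", stats.get("allocated_bytes.all.peak", 0))
        print(torch.cuda.memory_summary(abbreviated=True))

        # Persist a snapshot for the viewer at pytorch.org/memory_viz, then stop recording.
        torch.cuda.memory._dump_snapshot("allocator_snapshot.pickle")  # illustrative filename
        torch.cuda.memory._record_memory_history(enabled=None)

        # Route allocations made inside the block to a private pool (see use_mem_pool above).
        pool = torch.cuda.MemPool()
        with torch.cuda.use_mem_pool(pool):
            z = torch.empty(256, device="cuda")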
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
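+
+    Example (a minimal, illustrative sketch; the exact table layout can differ
+    between PyTorch builds)::
+
+        >>> # print an abbreviated, per-device summary of allocator statistics
+        >>> print(torch.cuda.memory_summary(abbreviated=True))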
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
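+
+    Example (a minimal, illustrative sketch; the reported numbers are
+    device- and workload-dependent)::
+
+        >>> # free and total memory on the current device, in GiB
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> free_gib, total_gib = free_bytes / 2**30, total_bytes / 2**30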
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
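+
+
+# A minimal usage sketch combining the history-recording helper defined above
+# with the snapshot helpers defined just below. `_demo_capture_snapshot` and
+# its `out_path` argument are illustrative names only, not part of this
+# module's API, and nothing in the module calls this function.
+def _demo_capture_snapshot(out_path="dump_snapshot.pickle"):
+    # start recording stack traces for every alloc/free event
+    _record_memory_history("all")
+    # ... run the workload to be profiled here ...
+    # write the snapshot so it can be inspected with pytorch.org/memory_viz
+    _dump_snapshot(out_path)
+    # stop recording
+    _record_memory_history(None)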
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot,
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
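+
+
+# A minimal sketch of per-iteration peak-memory tracking with the statistics
+# helpers defined earlier in this module (reset_peak_memory_stats and
+# max_memory_allocated). `_demo_track_peak_memory`, `model`, and `batches` are
+# illustrative names; nothing in the module calls this function.
+def _demo_track_peak_memory(model, batches):
+    peaks = []
+    for batch in batches:
+        reset_peak_memory_stats()  # start a fresh peak for this iteration
+        model(batch)
+        peaks.append(max_memory_allocated())  # bytes at this iteration's peak
+    return peaks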
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
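+
+
+# A minimal sketch of swapping in a custom allocator via the classes above.
+# `path_to_so`, `my_malloc`, and `my_free` are placeholders for a user-provided
+# shared library and its exported symbols; nothing in the module calls this
+# function, and it must run before the default allocator is first used.
+def _demo_use_pluggable_allocator(path_to_so):
+    custom = CUDAPluggableAllocator(path_to_so, "my_malloc", "my_free")
+    change_current_allocator(custom)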
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
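+
+
+# Hedged usage sketch (editorial example, not part of the module's public API):
+# it shows how MemPool and use_mem_pool defined above are typically combined.
+# The tensor shape is an arbitrary placeholder; a CUDA-capable device is assumed.
+def _example_allocate_in_private_pool():
+    """Allocate a tensor whose memory is routed to a private memory pool."""
+    pool = MemPool()  # None allocator: follow the current CUDACachingAllocator config
+    with use_mem_pool(pool):
+        tensor = torch.randn(1024, device="cuda")  # this allocation lands in `pool`
+    # Outside the context manager, allocations go back to the default pool.
+    return tensor, pool.snapshot()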
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
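+
+    A minimal sketch of that per-iteration pattern (the matmul stands in for a
+    real training step and a CUDA device is assumed to be available):
+
+    .. code-block:: python
+
+        import torch
+
+        x = torch.randn(1024, 1024, device="cuda")  # placeholder workload
+        for step in range(3):
+            torch.cuda.reset_peak_memory_stats()
+            y = x @ x  # the "iteration" being measured
+            print(
+                f"step {step}: peak allocated {torch.cuda.max_memory_allocated()} B, "
+                f"peak reserved {torch.cuda.max_memory_reserved()} B"
+            )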
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
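+
+    A minimal usage sketch (tensor sizes are arbitrary and a CUDA device is
+    assumed to be available):
+
+    .. code-block:: python
+
+        import torch
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # allocations made inside the context are routed to ``pool``
+            a = torch.empty(1024, device="cuda")
+        # allocations made outside the context use the default pool again
+        b = torch.empty(1024, device="cuda")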
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # an allocation received a call to free its memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
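+# Illustrative sketch combining the snapshot helpers above. The helper name
+# ``_example_visualize_snapshot`` and the output file names are hypothetical,
+# not part of the original module.
+def _example_visualize_snapshot(prefix: str = "cuda_mem") -> None:
+    """Take one snapshot and render both visualizations from it."""
+    snap = _snapshot()
+    # Reuse the same snapshot for both plots instead of snapshotting twice.
+    _save_segment_usage(f"{prefix}_segments.svg", snapshot=snap)
+    _save_memory_usage(f"{prefix}_memory.svg", snapshot=snap)
+
+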
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
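+# Illustrative sketch of loading a pluggable allocator. The shared-library
+# path and the exported symbol names below are hypothetical placeholders, not
+# something shipped with PyTorch.
+def _example_use_custom_allocator(so_path: str = "./my_alloc.so") -> None:
+    """Load a custom allocator and make it the active CUDA allocator."""
+    custom = CUDAPluggableAllocator(so_path, "my_malloc", "my_free")
+    # Must happen before the default allocator has served any CUDA allocation.
+    change_current_allocator(custom)
+
+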
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
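+
+
+# Illustrative sketch of routing allocations through a private pool using the
+# context manager above. The helper name ``_example_allocate_in_pool`` is
+# hypothetical; a CUDA device is required when it is called.
+def _example_allocate_in_pool() -> torch.Tensor:
+    """Allocate a tensor whose memory is served from a dedicated MemPool."""
+    pool = MemPool()
+    with use_mem_pool(pool):
+        # Allocations made inside the context are routed to ``pool``.
+        out = torch.empty(1024, device="cuda")
+    return out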
+
+
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream, this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream.
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
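+
+    A minimal usage sketch (illustrative only; assumes a CUDA-capable device;
+    the printed table depends on the current allocator state)::
+
+        >>> print(torch.cuda.memory_summary(abbreviated=True))  # doctest: +SKIP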
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
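+
+    A minimal usage sketch (illustrative only; assumes a CUDA-capable device;
+    the reported numbers are machine dependent)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> free_bytes <= total_bytes
+        True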
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (~2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # a request was received to free the allocated memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and adds it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly to free up memory for
+                                 # more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot,
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
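+
+    A minimal usage sketch (illustrative only; assumes a CUDA-capable device
+    and uses the :func:`~torch.cuda.use_mem_pool` helper defined below)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.ones(4, device="cuda")  # this allocation is routed to ``pool``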
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
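The history-recording and snapshot helpers above are normally driven together from user code. The following sketch is not part of the patch; it assumes a CUDA-capable PyTorch build in which this module is importable under its upstream path torch.cuda.memory (the patch adds the file as asdds.py), and the snapshot filename is arbitrary.

import torch

if torch.cuda.is_available():
    # Record alloc/free events together with the Python stacks that produced them.
    torch.cuda.memory._record_memory_history(enabled="all", context="all", stacks="python")

    x = torch.randn(1024, 1024, device="cuda")  # allocations served by the caching allocator
    y = x @ x
    del x, y

    # Human-readable statistics for the current device.
    print(torch.cuda.memory_summary(abbreviated=True))

    # Persist the snapshot for the viewer at pytorch.org/memory_viz, then stop recording.
    torch.cuda.memory._dump_snapshot("example_snapshot.pickle")
    torch.cuda.memory._record_memory_history(enabled=None)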
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
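+
+    Example (illustrative; assumes at least one visible CUDA device)::
+
+        >>> # xdoctest: +SKIP("requires a CUDA-capable GPU")
+        >>> print(torch.cuda.memory_summary(abbreviated=True))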
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
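+
+    Example (illustrative; assumes at least one visible CUDA device)::
+
+        >>> # xdoctest: +SKIP("requires a CUDA-capable GPU")
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes} bytes free out of {total_bytes}")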
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot,
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator, use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function that performs the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function that performs the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
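+
+    Example (illustrative; assumes at least one visible CUDA device)::
+
+        >>> # xdoctest: +SKIP("requires a CUDA-capable GPU")
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     buf = torch.empty(1024, device="cuda")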
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
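+
+
+# The block below is an illustrative usage sketch of the statistics APIs defined
+# in this module, guarded so it only runs when the file is executed directly.
+# It assumes a CUDA-capable build with at least one visible device; the tensor
+# shape is arbitrary and only gives the caching allocator something to report.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        x = torch.empty(1024, 1024, device="cuda")  # ~4 MiB of float32
+
+        # Flat statistics dictionary plus a human-readable summary table.
+        stats = memory_stats()
+        print("current allocated bytes:", stats.get("allocated_bytes.all.current", 0))
+        print(memory_summary(abbreviated=True))
+
+        # Global free/total device memory as reported by cudaMemGetInfo.
+        free_bytes, total_bytes = mem_get_info()
+        print(f"free {free_bytes} of {total_bytes} bytes")
+
+        # Drop the tensor, release unoccupied cached blocks, and reset peak
+        # counters, e.g. between training iterations.
+        del x
+        empty_cache()
+        reset_peak_memory_stats()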
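For the MemPool APIs defined above, a minimal usage sketch (an illustration only; it assumes the classes are importable from torch.cuda.memory and that a CUDA device is present):

import torch
from torch.cuda.memory import MemPool, use_mem_pool

if torch.cuda.is_available():
    pool = MemPool()  # allocator=None: follow the current CUDACachingAllocator config
    with use_mem_pool(pool):
        # Allocations made inside the context are routed to `pool`.
        buf = torch.empty(8 * 1024 * 1024, dtype=torch.uint8, device="cuda")
    # Pool identity and reference count, as documented in the MemPool class above.
    print(pool.id, pool.use_count())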
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
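The hunk above ends the added module. As a quick orientation for reviewers, here is a minimal usage sketch of the introspection helpers it defines (`mem_get_info`, `_record_memory_history`, `_dump_snapshot`, `memory_summary`); the `train_step` workload and the snapshot filename are illustrative placeholders, and a CUDA-capable build with at least one device is assumed:

import torch

def train_step():
    # Hypothetical workload, used only to generate some allocations.
    x = torch.randn(1024, 1024, device="cuda")
    return (x @ x).sum()

if torch.cuda.is_available():
    # Global free/total device memory, straight from cudaMemGetInfo.
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    print(f"free {free_bytes / 2**20:.0f} MiB of {total_bytes / 2**20:.0f} MiB")

    # Record alloc/free events (with stack traces), run the workload, then dump
    # a snapshot that can be loaded into the viewer at pytorch.org/memory_viz.
    torch.cuda.memory._record_memory_history(max_entries=100_000)
    train_step()
    torch.cuda.memory._dump_snapshot("train_step_snapshot.pickle")
    torch.cuda.memory._record_memory_history(enabled=None)  # stop recording

    # Human-readable allocator statistics for the current device.
    print(torch.cuda.memory_summary(abbreviated=True))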
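`CUDAPluggableAllocator` and `change_current_allocator` are easiest to read together. The sketch below assumes a hypothetical shared library `./my_alloc.so` exporting `my_malloc`/`my_free` with the C signatures given in the docstring; it must run before the default caching allocator has been initialized, otherwise `change_current_allocator` raises an error:

import torch

# Load the custom allocator from the (hypothetical) shared library and tell
# PyTorch which exported symbols implement allocation and deallocation.
custom_alloc = torch.cuda.memory.CUDAPluggableAllocator(
    "./my_alloc.so",            # hypothetical path
    alloc_fn_name="my_malloc",  # void* my_malloc(ssize_t size, int device, cudaStream_t stream)
    free_fn_name="my_free",     # void my_free(void* ptr, size_t size, cudaStream_t stream)
)

# Swap it in before any CUDA memory has been allocated through the default allocator.
torch.cuda.memory.change_current_allocator(custom_alloc)

# From here on, CUDA allocations are serviced by my_malloc/my_free.
x = torch.zeros(16, device="cuda")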
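Similarly, a short sketch of the `MemPool` / `use_mem_pool` API that closes the hunk; it only assumes a CUDA device is available:

import torch

# Create a pool that follows the current CUDACachingAllocator configuration.
pool = torch.cuda.MemPool()

# Allocations made inside the context manager are routed to `pool`
# on the current device.
with torch.cuda.use_mem_pool(pool):
    scratch = torch.empty(8, 1024, 1024, device="cuda")

print("pool id:", pool.id)                  # (int, int) identifier inside the allocator
print("pool use_count:", pool.use_count())
segments = pool.snapshot()                  # allocator segments captured while this pool is active
print("segments captured:", len(segments))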
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
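+
+    A small usage sketch (``model`` and ``batch`` are hypothetical placeholders,
+    not part of this API), printing the summary while handling an out-of-memory
+    error:
+
+    .. code-block:: python
+
+        try:
+            out = model(batch)  # may raise when the device runs out of memory
+        except torch.cuda.OutOfMemoryError:
+            print(torch.cuda.memory_summary(abbreviated=True))
+            raise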
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
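+
+    A minimal sketch of interpreting the returned ``(free, total)`` tuple (the
+    formatting below is purely illustrative):
+
+    .. code-block:: python
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"free: {free_bytes / 1024**3:.2f} GiB of {total_bytes / 1024**3:.2f} GiB")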
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
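+
+    A minimal sketch of routing allocations through a pool (assuming ``MemPool``
+    and the ``use_mem_pool`` context manager defined later in this module are
+    re-exported under ``torch.cuda``, as the ``__all__`` list above suggests):
+
+    .. code-block:: python
+
+        pool = torch.cuda.MemPool()               # pool backed by the current allocator
+        with torch.cuda.use_mem_pool(pool):
+            x = torch.empty(1024, device="cuda")  # this allocation is routed to ``pool``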
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+    device = _get_amdsmi_device_index(device)
+
+    try:
+        handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+        procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+    except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+        return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes the amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistics for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocator received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache() was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
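The module above covers the full snapshot and record-history workflow. As a quick orientation (not part of the patch), a minimal debugging session using only the entry points documented above might look like the sketch below; it assumes a CUDA-capable build and that this module is reachable as torch.cuda.memory, as its own docstrings suggest.

# Minimal sketch (assumption: module is importable as torch.cuda.memory).
import torch

if torch.cuda.is_available():
    # Start recording alloc/free history with stack traces.
    torch.cuda.memory._record_memory_history(max_entries=100_000)

    x = torch.randn(1024, 1024, device="cuda")
    y = x @ x  # some workload worth profiling

    # Dump a snapshot that can be loaded at pytorch.org/memory_viz.
    torch.cuda.memory._dump_snapshot("run_snapshot.pickle")

    # Human-readable allocator statistics for the current device.
    print(torch.cuda.memory_summary(abbreviated=True))

    # Stop recording.
    torch.cuda.memory._record_memory_history(None)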
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot',  # the allocator generated a memory snapshot
+                         # useful to correlate a previously taken
+                         # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
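+
+
+# ---------------------------------------------------------------------------
+# Illustrative usage sketches (added for review; not part of the upstream
+# torch.cuda.memory module). Helper names prefixed with `_example_` are
+# hypothetical, and each sketch assumes a CUDA-capable PyTorch build.
+# ---------------------------------------------------------------------------
+
+
+def _example_dump_snapshot(out_file: str = "dump_snapshot.pickle") -> None:
+    # Minimal sketch of the snapshot workflow documented above: enable
+    # allocator history recording, run a small workload, then dump a pickle
+    # that can be inspected at pytorch.org/memory_viz.
+    if not torch.cuda.is_available():
+        return
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    x = torch.randn(1024, 1024, device="cuda")
+    y = x @ x  # a few allocations so the trace is non-trivial
+    del x, y
+    _dump_snapshot(out_file)
+    _record_memory_history(enabled=None)  # stop recording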
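+
+
+def _example_custom_allocator(so_path: str) -> None:
+    # Hedged sketch of the CUDAPluggableAllocator / change_current_allocator
+    # flow above. `so_path`, "my_alloc", and "my_free" are placeholders for a
+    # user-compiled shared library exposing the two C symbols documented in
+    # CUDAPluggableAllocator; the swap must happen before any CUDA allocation.
+    allocator = CUDAPluggableAllocator(so_path, "my_alloc", "my_free")
+    change_current_allocator(allocator)
+    torch.zeros(8, device="cuda")  # now served by the custom allocator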
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
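+
+    Example (an illustrative sketch, not part of the original docstring; it assumes
+    a CUDA build with at least one visible device and uses only functions defined
+    in this module)::
+
+        >>> import torch
+        >>> torch.cuda.reset_peak_memory_stats()        # start a fresh measurement window
+        >>> x = torch.empty(1024, 1024, device="cuda")  # force the allocator to reserve memory
+        >>> torch.cuda.max_memory_reserved() >= torch.cuda.memory_reserved()
+        True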
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
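+
+        # Like the pynvml branch above, this amdsmi branch resolves a device handle and
+        # builds ``procs``; the loop below formats one summary line per process.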
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and adds it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
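+
+    Example (an illustrative sketch, not part of the original docstring; it assumes
+    a CUDA device is available and that this module is exposed as ``torch.cuda``,
+    as the references above suggest)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     buf = torch.empty(1024, device="cuda")  # this allocation is routed to ``pool``
+        >>> segments = pool.snapshot()  # inspect the segments owned by this pool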
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
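+
+
+# Illustrative usage sketch (not part of the original module): a minimal smoke test of
+# the statistics helpers defined above. Because of the relative imports at the top of
+# this file it only runs when the file is executed as a module inside its package
+# (``python -m ...``) on a machine with a CUDA device; otherwise it is never reached.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        reset_peak_memory_stats()
+        t = torch.randn(256, 256, device="cuda")
+        stats = memory_stats()
+        print("allocated_bytes.all.current:", stats["allocated_bytes.all.current"])
+        print("reserved_bytes.all.peak:", stats["reserved_bytes.all.peak"])
+        print(memory_summary(abbreviated=True))
+        del t
+        empty_cache()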
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
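+
+    Example (a minimal sketch; assumes a CUDA device is available and that
+    ``MemPool`` and ``use_mem_pool`` are re-exported under ``torch.cuda``)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # allocation is routed to ``pool``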
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
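+
+
+# Minimal usage sketch (illustrative only; assumes at least one CUDA device is
+# available). The output filename "example_snapshot.pickle" is arbitrary.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        _record_memory_history("all")  # record alloc/free events with stack traces
+        x = torch.empty(64, 1024, 1024, device="cuda")  # allocate ~256 MiB of float32
+        print(memory_summary(abbreviated=True))  # human-readable allocator statistics
+        _dump_snapshot("example_snapshot.pickle")  # viewable at pytorch.org/memory_viz
+        _record_memory_history(None)  # stop recording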
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
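+
+    A minimal usage sketch (illustrative only; assumes a CUDA device is
+    available, and the exact table layout may vary between releases)::
+
+        import torch
+
+        x = torch.randn(256, 256, device="cuda")
+        print(torch.cuda.memory_summary())                   # full per-pool table
+        print(torch.cuda.memory_summary(abbreviated=True))   # pool breakdown collapsed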
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
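+
+    A minimal usage sketch (illustrative only; assumes a CUDA device is
+    available, and the reported numbers are device dependent)::
+
+        import torch
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"{free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB total")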
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and adds it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
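+
+    A minimal usage sketch (illustrative only; assumes a CUDA device is
+    available and that, as in upstream PyTorch, these names are re-exported
+    under ``torch.cuda``)::
+
+        import torch
+
+        pool = torch.cuda.MemPool()                  # pool backed by the current allocator
+        with torch.cuda.use_mem_pool(pool):
+            x = torch.empty(1024, device="cuda")     # allocation routed to ``pool``
+        y = torch.empty(1024, device="cuda")         # back to the regular pool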
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',           # memory allocated
+                'free_requested',  # the allocated block received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',   # the caching allocator asked cudaMalloc for more memory
+                                   # and added it as a segment in its cache
+                'segment_free',    # the caching allocator called cudaFree to return memory
+                                   # to cuda, possibly trying to free up memory to
+                                   # allocate more segments or because empty_cache was called
+                'oom',             # the allocator threw an OOM exception. 'size' is
+                                   # the requested number of bytes that did not succeed
+                'snapshot',        # the allocator generated a memory snapshot
+                                   # useful to correlate a previously taken
+                                   # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
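
For quick reference, here is a minimal usage sketch of the memory-introspection helpers this module adds (it assumes a CUDA-capable PyTorch build; the tensor shape and the `snapshot.pickle` filename are only illustrative):

import torch

if torch.cuda.is_available():
    # Record alloc/free history so later snapshots carry stack traces.
    torch.cuda.memory._record_memory_history(max_entries=100_000)

    x = torch.randn(1024, 1024, device="cuda")  # illustrative allocation

    # Current / peak usage as tracked by the caching allocator.
    print(torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated())
    print(torch.cuda.memory_reserved(), torch.cuda.max_memory_reserved())

    # Full statistics dict and the human-readable table built from it.
    stats = torch.cuda.memory_stats()
    print(stats.get("allocated_bytes.all.current", 0))
    print(torch.cuda.memory_summary(abbreviated=True))

    # Persist a snapshot for the viewer at pytorch.org/memory_viz,
    # then stop recording history.
    torch.cuda.memory._dump_snapshot("snapshot.pickle")
    torch.cuda.memory._record_memory_history(enabled=None)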
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocated memory received a call to free it
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception; 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot,
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
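+
+
+# Illustrative usage sketch (editor's addition, not part of the original module):
+# routes a group of allocations into a dedicated pool and inspects the pool afterwards.
+# The helper name `_example_use_mem_pool` is hypothetical.
+def _example_use_mem_pool():
+    pool = MemPool()  # pool backed by the default CUDACachingAllocator configuration
+    with use_mem_pool(pool):
+        # Allocations made inside the context are served from `pool`.
+        scratch = torch.empty(256, 256, device="cuda")
+    # The pool keeps the memory cached; its state can be inspected separately.
+    print("pool id:", pool.id, "refcount:", pool.use_count())
+    del scratch
+    return pool.snapshot()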
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
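+
+    The example below is an illustrative sketch and assumes a CUDA device is
+    available; by definition the peak is at least the current value reported
+    by :func:`~torch.cuda.memory_reserved`::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> torch.cuda.max_memory_reserved() >= torch.cuda.memory_reserved()
+        True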
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSs
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
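+
+    A minimal illustrative sketch (assumes a CUDA device is available); every
+    allocation made inside the ``with`` block is routed to ``pool``::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.ones(4, device="cuda")  # served from ``pool``
+        >>> y = torch.ones(4, device="cuda")  # allocated via the default pool again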
+
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
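+
+
+# ---------------------------------------------------------------------------
+# Illustrative usage sketch (not part of the upstream module): a hedged,
+# minimal example of how the helpers above fit together -- record allocation
+# history, allocate a tensor, inspect allocator statistics, and dump a
+# snapshot that can be inspected at pytorch.org/memory_viz. Guarded by
+# ``__main__`` so importing this file has no side effects; it only runs when
+# the file is executed directly on a machine with a CUDA device.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        _record_memory_history("all")  # keep stack traces for alloc/free events
+        x = torch.empty(1024, 1024, device="cuda")  # trigger a device allocation
+        print(f"{memory_allocated()} bytes currently allocated")
+        print(memory_summary(abbreviated=True))
+        _dump_snapshot("example_snapshot.pickle")  # open with the memory_viz tool
+        del x
+        empty_cache()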
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot,
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
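+
+    Example (illustrative; the value reported depends on how
+    ``PYTORCH_CUDA_ALLOC_CONF`` is configured):
+
+        >>> torch.cuda.get_allocator_backend()  # doctest: +SKIP
+        'native'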
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will raise an error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
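+
+
+# Illustrative usage sketch (assumes a CUDA-capable device is available; the
+# helper below is hypothetical, defined only to document the MemPool flow, and
+# is never called on import).
+def _example_route_allocations_to_pool() -> int:
+    """Route temporary allocations to a private MemPool and report its use count."""
+    pool = MemPool()  # backed by the current CUDACachingAllocator configuration
+    with use_mem_pool(pool):
+        # Allocations made inside this block are served from `pool`.
+        scratch = torch.empty(1024, 1024, device="cuda")
+        scratch.zero_()
+    del scratch
+    # use_count() reports the pool's reference count after the block exits.
+    return pool.use_count()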
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
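+
+    Example (a minimal usage sketch)::
+
+        >>> import torch
+        >>> # print the abbreviated allocator table for the current device
+        >>> print(torch.cuda.memory_summary(abbreviated=True))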
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
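+
+    Example (a minimal usage sketch; the reported numbers depend on the GPU and
+    on other processes using it)::
+
+        >>> import torch
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes / 1024**3:.1f} GiB free of {total_bytes / 1024**3:.1f} GiB")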
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and adds it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
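+
+
+# A minimal usage sketch of the MemPool APIs defined above (illustrative only,
+# requires a CUDA-capable device; the tensor size is arbitrary): allocations made
+# inside `use_mem_pool` are routed to the given pool instead of the default pool.
+if __name__ == "__main__":
+    demo_pool = MemPool()
+    with use_mem_pool(demo_pool):
+        scratch = torch.empty(1024, device="cuda")  # served from `demo_pool`
+    print(demo_pool.use_count())  # reference count held on the pool
+    print(len(demo_pool.snapshot()))  # segments currently backing the pool
+    del scratch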
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
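
For quick reference, the short sketches below exercise the public entry points this file defines. They are illustrative only: each assumes a CUDA-enabled PyTorch build with at least one visible GPU, and that the module is importable as ``torch.cuda.memory`` (with the usual re-exports under ``torch.cuda``), as its own docstrings indicate.

A minimal look at the statistics helpers (``memory_stats``, ``memory_allocated``, ``memory_summary``):

    import torch

    # Allocate something so the counters are non-trivial.
    x = torch.empty(1024, 1024, device="cuda")

    stats = torch.cuda.memory_stats(device=0)            # flat dict of counters
    print(stats.get("allocated_bytes.all.current", 0))   # bytes currently allocated
    print(torch.cuda.memory_allocated(0))                # same figure via the convenience wrapper
    print(torch.cuda.memory_summary(device=0, abbreviated=True))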
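
The allocation-history hooks (``_record_memory_history``, ``_snapshot``, ``_dump_snapshot``) are typically used together. A sketch of that workflow; the output filename here is an arbitrary example:

    import torch

    torch.cuda.memory._record_memory_history(max_entries=100_000)

    # ... run the workload to be profiled ...
    y = torch.randn(4096, 4096, device="cuda") @ torch.randn(4096, 4096, device="cuda")

    # Write a snapshot for the interactive viewer at pytorch.org/memory_viz.
    torch.cuda.memory._dump_snapshot("workload_snapshot.pickle")

    # Stop recording when done.
    torch.cuda.memory._record_memory_history(enabled=None)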
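
``mem_get_info`` and ``list_gpu_processes`` are convenient when reporting how much device memory is left, for example while handling an out-of-memory condition:

    import torch

    free, total = torch.cuda.mem_get_info()   # global free / total bytes via cudaMemGetInfo
    print(f"{free / 2**30:.2f} GiB free of {total / 2**30:.2f} GiB")
    print(torch.cuda.list_gpu_processes())    # one line per process using this GPU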
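
Loading a custom allocator follows directly from the ``CUDAPluggableAllocator`` docstring. The shared-library path and exported symbol names below are placeholders, and the switch must happen before the default allocator has been initialized:

    import torch

    # my_allocator.so is assumed to export:
    #   void* my_alloc(ssize_t size, int device, cudaStream_t stream);
    #   void  my_free(void* ptr, size_t size, cudaStream_t stream);
    custom = torch.cuda.memory.CUDAPluggableAllocator(
        "./my_allocator.so", "my_alloc", "my_free"
    )
    torch.cuda.memory.change_current_allocator(custom)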
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
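+        # ROCm path (descriptive comment added for clarity): translate the torch
+        # device index to the amdsmi index below, then query that device's
+        # compute process list via amdsmi.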
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream, this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream.
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                           'active_awaiting_free', # waiting for another stream to finish using
+                                                   # this, then it will become free
+                           'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested', # the allocation received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                              # able to be used in future allocation calls
+            'segment_alloc', # the caching allocator asked cudaMalloc for more memory
+                             # and added it as a segment in its cache
+            'segment_free', # the caching allocator called cudaFree to return memory
+                            # to cuda, possibly trying to free up memory to
+                            # allocate more segments or because empty_cache was called
+            'oom',          # the allocator threw an OOM exception. 'size' is
+                            # the requested number of bytes that did not succeed
+            'snapshot'      # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
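
Usage sketch (illustrative only): the statistics helpers defined above are typically combined as below to bracket a unit of GPU work. This assumes a CUDA-capable PyTorch build in which this module is exposed at its upstream location, ``torch.cuda.memory`` (re-exported under ``torch.cuda``); every function used is defined earlier in this file.

    # A minimal sketch: reset the peak window, do some work, read back stats.
    import torch

    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()        # start a fresh peak window
        x = torch.randn(1024, 1024, device="cuda")  # placeholder GPU work
        y = x @ x
        stats = torch.cuda.memory_stats()
        print("current allocated bytes:", torch.cuda.memory_allocated())
        print("peak allocated bytes:   ", torch.cuda.max_memory_allocated())
        print("cudaMalloc retries:     ", stats.get("num_alloc_retries", 0))
        print(torch.cuda.memory_summary(abbreviated=True))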
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
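
A second sketch covers the private history-recording helpers (``_record_memory_history``, ``_dump_snapshot``). These are underscore-prefixed APIs that may change between releases; the module path below is an assumption that matches the upstream layout, and the snapshot format is as described in the ``_snapshot`` docstring above.

    # A minimal sketch: record alloc/free history, then dump a snapshot that
    # can be opened in the pytorch.org/memory_viz viewer.
    import torch
    from torch.cuda import memory as cuda_memory  # upstream module path assumed

    if torch.cuda.is_available():
        cuda_memory._record_memory_history(enabled="all", max_entries=100_000)
        bufs = [torch.empty(4 * 1024 * 1024, device="cuda") for _ in range(8)]
        del bufs
        torch.cuda.empty_cache()
        cuda_memory._dump_snapshot("dump_snapshot.pickle")
        cuda_memory._record_memory_history(enabled=None)  # stop recording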
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
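+
+
+# A minimal usage sketch (illustrative only): swap in a pluggable allocator built as a
+# shared library. "my_allocator.so", "my_malloc" and "my_free" are placeholder names for
+# a library and symbols you would have to provide yourself; the swap must happen before
+# the default caching allocator has been used, otherwise change_current_allocator raises.
+def _example_use_pluggable_allocator(so_path: str = "my_allocator.so") -> None:
+    custom = CUDAPluggableAllocator(so_path, "my_malloc", "my_free")
+    change_current_allocator(custom)  # subsequent CUDA allocations go through `custom`
+    _ = torch.zeros(16, device="cuda")  # served by the pluggable allocator
+
+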
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
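+
+
+# A minimal usage sketch (illustrative only): route a tensor allocation into a private
+# pool via the context manager above. The helper name `_example_allocate_in_pool` is
+# arbitrary, and a CUDA device is assumed to be available.
+def _example_allocate_in_pool() -> MemPool:
+    pool = MemPool()  # defaults to the current CUDACachingAllocator configuration
+    with use_mem_pool(pool):
+        kept = torch.ones(256, 256, device="cuda")  # backed by `pool`
+    # Outside the context, new allocations go back to the default pool.
+    print(pool.use_count(), kept.device)
+    return pool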
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
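+
+
+# --- Illustrative usage (editor's sketch, not part of the upstream API surface) ---
+# A minimal, hedged example of the workflow documented above: record allocation
+# history, make a few allocations, inspect allocator statistics, and dump a
+# snapshot that can be opened at pytorch.org/memory_viz. It only calls functions
+# defined in this module plus torch.empty/torch.cuda.is_available, and it assumes
+# a CUDA-capable build; the __main__ guard keeps importing this module side-effect free.
+if __name__ == "__main__" and torch.cuda.is_available():
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    x = torch.empty(1024, 1024, device="cuda")  # ~4 MiB allocation
+    y = torch.empty(4 * 1024, 1024, device="cuda")  # ~16 MiB allocation
+    del y  # the freed block stays cached by the allocator (reserved, not allocated)
+    print(f"allocated: {memory_allocated()} B, reserved: {memory_reserved()} B")
+    print(memory_summary(abbreviated=True))
+    _dump_snapshot("example_snapshot.pickle")  # hypothetical output filename
+    _record_memory_history(enabled=None)  # stop recording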
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocated block received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX OSs.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
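As a usage note for the introspection helpers this module exposes, here is a minimal sketch, assuming the module is importable at its upstream location (torch.cuda.memory) and that a CUDA-capable build with at least one device is present; the printed fields are illustrative:

import torch

if torch.cuda.is_available():
    free_bytes, total_bytes = torch.cuda.mem_get_info()   # global free/total via cudaMemGetInfo
    allocated = torch.cuda.memory_allocated()              # bytes currently held by tensors
    reserved = torch.cuda.memory_reserved()                 # bytes held by the caching allocator
    print(f"free={free_bytes} total={total_bytes} allocated={allocated} reserved={reserved}")
    print(torch.cuda.memory_summary(abbreviated=True))      # human-readable table of the same stats
    print(torch.cuda.list_gpu_processes())                  # per-process GPU memory use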
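Likewise, a hedged sketch of the snapshot workflow described in the _record_memory_history and _dump_snapshot docstrings; these are private helpers, so the exact keywords may change between releases, and the workload and output filename below are arbitrary assumptions:

import torch

if torch.cuda.is_available():
    torch.cuda.memory._record_memory_history(enabled="all", context="all", stacks="python")

    x = torch.randn(1024, 1024, device="cuda")   # arbitrary workload whose allocations get traced
    y = x @ x
    del x, y
    torch.cuda.empty_cache()

    # Writes a pickle that the interactive viewer at pytorch.org/memory_viz can open.
    torch.cuda.memory._dump_snapshot("example_snapshot.pickle")
    torch.cuda.memory._record_memory_history(enabled=None)   # stop recording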
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
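+
+    A minimal usage sketch (illustrative only; assumes a CUDA device is available)::
+
+        import torch
+
+        x = torch.randn(4, 1024, 1024, device="cuda")
+        print(torch.cuda.memory_summary())                   # full per-pool table
+        print(torch.cuda.memory_summary(abbreviated=True))   # condensed view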
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
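+
+    A minimal usage sketch (illustrative only; assumes a CUDA device is available)::
+
+        import torch
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"{free_bytes / 1024**3:.2f} GiB free of {total_bytes / 1024**3:.2f} GiB total")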
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # a free of the allocated memory was requested
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
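+
+    A minimal usage sketch (illustrative only; assumes a CUDA device is available
+    and that this module is re-exported under ``torch.cuda``, so that ``MemPool``
+    and ``use_mem_pool`` are reachable from there)::
+
+        import torch
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            x = torch.ones(1024, device="cuda")  # this allocation is routed to the pool
+        print(pool.snapshot())  # inspect segments owned by this pool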
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
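+
+
+# A minimal, illustrative sketch of the snapshot workflow documented above.
+# It assumes a CUDA device is available and uses only helpers defined in this
+# module (`_record_memory_history` and `_dump_snapshot` are private APIs):
+#
+#     import torch
+#
+#     torch.cuda.memory._record_memory_history(max_entries=100_000)
+#     x = torch.randn(1024, 1024, device="cuda")  # traced allocation
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")
+#     # open snapshot.pickle in the viewer at pytorch.org/memory_viz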
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
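+
+
+# The block below is an illustrative usage sketch, not part of the upstream
+# module: it assumes a CUDA-capable build and device, and simply exercises a
+# few of the helpers defined above (memory_allocated, memory_summary,
+# _record_memory_history/_dump_snapshot, MemPool/use_mem_pool) to show how
+# they fit together. The snapshot filename is arbitrary.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        # Record alloc/free history so the snapshot below carries stack traces.
+        _record_memory_history()
+
+        # Allocate a tensor and inspect allocator statistics.
+        x = torch.randn(1024, 1024, device="cuda")
+        print(f"allocated: {memory_allocated()} bytes")
+        print(f"reserved:  {memory_reserved()} bytes")
+        print(memory_summary(abbreviated=True))
+
+        # Route allocations made inside the context into a private pool.
+        pool = MemPool()
+        with use_mem_pool(pool):
+            y = torch.randn(1024, 1024, device="cuda")
+        print(f"pool use_count: {pool.use_count()}")
+
+        # Dump a snapshot that can be loaded at pytorch.org/memory_viz, then
+        # release cached, unoccupied blocks back to the driver.
+        _dump_snapshot("demo_snapshot.pickle")
+        del x, y
+        empty_cache()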
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+
+    See :func:`~torch.cuda.max_memory_cached` for details.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. warning::
+        This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets
+        /all/ peak memory stats.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    warnings.warn(
+        "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, "
+        "which resets /all/ peak memory stats.",
+        FutureWarning,
+    )
+    return reset_peak_memory_stats(device=device)
+
+
+def memory_allocated(device: Union[Device, int] = None) -> int:
+    r"""Return the current GPU memory occupied by tensors in bytes for a given device.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        This is likely less than the amount shown in `nvidia-smi` since some
+        unused memory can be held by the caching allocator and some context
+        needs to be created on GPU. See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    return memory_stats(device=device).get("allocated_bytes.all.current", 0)
+
+
+def max_memory_allocated(device: Union[Device, int] = None) -> int:
+    r"""Return the maximum GPU memory occupied by tensors in bytes for a given device.
+
+    By default, this returns the peak allocated memory since the beginning of
+    this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to
+    reset the starting point in tracking this metric. For example, these two
+    functions can measure the peak allocated memory usage of each iteration in a
+    training loop.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    return memory_stats(device=device).get("allocated_bytes.all.peak", 0)
+
+
+def memory_reserved(device: Union[Device, int] = None) -> int:
+    r"""Return the current GPU memory managed by the caching allocator in bytes for a given device.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    return memory_stats(device=device).get("reserved_bytes.all.current", 0)
+
+
+def max_memory_reserved(device: Union[Device, int] = None) -> int:
+    r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device.
+
+    By default, this returns the peak cached memory since the beginning of this
+    program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset
+    the starting point in tracking this metric. For example, these two functions
+    can measure the peak cached memory amount of each iteration in a training
+    loop.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    return memory_stats(device=device).get("reserved_bytes.all.peak", 0)
+
+
+@deprecated(
+    "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`",
+    category=FutureWarning,
+)
+def memory_cached(device: Union[Device, int] = None) -> int:
+    r"""Deprecated; see :func:`~torch.cuda.memory_reserved`."""
+    return memory_reserved(device=device)
+
+
+@deprecated(
+    "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`",
+    category=FutureWarning,
+)
+def max_memory_cached(device: Union[Device, int] = None) -> int:
+    r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`."""
+    return max_memory_reserved(device=device)
+
+
+def memory_snapshot():
+    r"""Return a snapshot of the CUDA memory allocator state across all devices.
+
+    Interpreting the output of this function requires familiarity with the
+    memory allocator internals.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    return torch._C._cuda_memorySnapshot()["segments"]
+
+
+def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str:
+    r"""Return a human-readable printout of the current memory allocator statistics for a given device.
+
+    This can be useful to display periodically during training, or when
+    handling out-of-memory exceptions.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            printout for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+        abbreviated (bool, optional): whether to return an abbreviated summary
+            (default: False).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot,
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
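+
+
+# ---------------------------------------------------------------------------
+# Editor's illustrative sketch (not part of the upstream allocator API): a
+# minimal, hedged example of how the helpers defined above could be combined.
+# The tensor shapes, the helper name `_memory_usage_demo`, and the use of the
+# default CUDA device are assumptions made purely for illustration.
+# ---------------------------------------------------------------------------
+def _memory_usage_demo() -> None:
+    if not torch.cuda.is_available():
+        return
+    # Peak-tracking pattern described in max_memory_allocated(): reset the
+    # peak counters, run a workload, then read the recorded peak back.
+    reset_peak_memory_stats()
+    workload = torch.randn(1024, 1024, device="cuda")
+    print("peak allocated bytes:", max_memory_allocated())
+    del workload
+
+    # Route allocations into a private pool with the context manager above,
+    # then print the abbreviated allocator summary.
+    pool = MemPool()
+    with use_mem_pool(pool):
+        scratch = torch.empty(256, 256, device="cuda")
+        del scratch
+    print(memory_summary(abbreviated=True))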
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function that performs the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function that performs the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
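+
+
+# Illustrative sketch, not part of the module's public API: one plausible way to
+# swap in a custom allocator compiled to a shared library. The path
+# "./my_alloc.so" and the symbol names "my_malloc"/"my_free" are hypothetical;
+# they must match functions exported by your own library, and the swap must
+# happen before any CUDA memory has been allocated.
+def _example_install_pluggable_allocator() -> None:
+    custom = CUDAPluggableAllocator("./my_alloc.so", "my_malloc", "my_free")
+    change_current_allocator(custom)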
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
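
For reference, a minimal usage sketch of the inspection helpers added above (memory_stats, memory_summary, _record_memory_history, and _dump_snapshot). This is illustrative only and not part of the patch; it assumes a CUDA-enabled build of torch with at least one visible device.

    import torch

    if torch.cuda.is_available():
        # Record alloc/free history so snapshots carry stack traces.
        torch.cuda.memory._record_memory_history(enabled="all", context="all", stacks="python")

        x = torch.empty(1024, 1024, device="cuda")  # triggers an allocation

        stats = torch.cuda.memory_stats()
        print(stats["allocated_bytes.all.current"])        # bytes currently allocated
        print(torch.cuda.memory_summary(abbreviated=True))  # human-readable table

        # Persist a snapshot for the pytorch.org/memory_viz viewer, then stop recording.
        torch.cuda.memory._dump_snapshot("dump_snapshot.pickle")
        torch.cuda.memory._record_memory_history(enabled=None)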
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int #  cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                        'active_awaiting_free', # waiting for another stream to finish using
+                                                # this, then it will become free
+                        'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+                filename: str
+                line: int
+                name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested', # the allocation received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                            # able to be used in future allocation calls
+            'segment_alloc', # the caching allocator asks cudaMalloc for more memory
+                            # and added it as a segment in its cache
+            'segment_free', # the caching allocator called cudaFree to return memory
+                            # to cuda, possibly trying to free up memory to
+                            # allocate more segments or because empty_cache() was called
+            'oom',          # the allocator threw an OOM exception. 'size' is
+                            # the requested number of bytes that did not succeed
+            'snapshot'      # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
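+
+    Example (illustrative sketch; assumes a CUDA-capable build with a visible device)::
+
+        >>> # xdoctest: +SKIP("requires a CUDA device")
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.memory.use_mem_pool(pool):
+        ...     buf = torch.empty(1024, device="cuda")  # served from ``pool``
+        >>> isinstance(pool.id, tuple)
+        True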
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
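+
+    Example (illustrative only; assumes a CUDA-enabled build of PyTorch)::
+
+        import torch
+
+        # Print an abbreviated allocator report for the current device, e.g.
+        # periodically during training or when handling an OOM exception.
+        print(torch.cuda.memory_summary(abbreviated=True))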
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
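+
+    Example (a small sketch; assumes a CUDA-enabled build of PyTorch)::
+
+        import torch
+
+        free, total = torch.cuda.mem_get_info()
+        # Both values are in bytes and describe the whole device, not just
+        # this process.
+        print(f"{free / 2**30:.2f} GiB free of {total / 2**30:.2f} GiB")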
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
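+
+    Example (an illustrative sketch; assumes a CUDA-enabled build in which the
+    pool helpers of this module are re-exported as ``torch.cuda.MemPool`` and
+    ``torch.cuda.use_mem_pool``)::
+
+        import torch
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # Allocations made inside the context are routed to ``pool``.
+            x = torch.empty(1024, device="cuda")
+        print(pool.use_count())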
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
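+
+    Example (a minimal sketch added for illustration; assumes a CUDA-enabled
+    build of PyTorch and enough free device memory for the request)::
+
+        import torch
+
+        ptr = torch.cuda.caching_allocator_alloc(1024)  # raw pointer to 1 KiB
+        # ... pass ``ptr`` to another framework that expects a device pointer ...
+        torch.cuda.caching_allocator_delete(ptr)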
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
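+
+    Example (illustrative only; assumes a CUDA-enabled build of PyTorch)::
+
+        import torch
+
+        torch.cuda.reset_peak_memory_stats()
+        x = torch.empty(1 << 20, device="cuda")  # reserve some memory
+        del x
+        # The peak stays at the high-water mark even after the tensor is freed.
+        print(torch.cuda.max_memory_reserved())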
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
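+
+
+# Usage sketch: a minimal, illustrative exercise of the statistics helpers
+# defined above. Assumes a CUDA-capable torch build; guarded so that importing
+# this module stays free of side effects.
+if __name__ == "__main__" and torch.cuda.is_available():
+    x = torch.empty(1024, 1024, device="cuda")  # ~4 MiB of float32
+    print(f"allocated: {memory_allocated()} B, reserved: {memory_reserved()} B")
+    print(memory_summary(abbreviated=True))  # compact per-pool statistics table
+    del x
+    empty_cache()  # return cached, unoccupied segments to the driver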
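+
+
+# Usage sketch: record allocator history and dump a snapshot for the
+# pytorch.org/memory_viz viewer. The file name "example_snapshot.pickle" is
+# illustrative only; also assumes a CUDA-capable torch build.
+if __name__ == "__main__" and torch.cuda.is_available():
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    y = torch.randn(1024, 1024, device="cuda")  # allocation captured in the history
+    _dump_snapshot("example_snapshot.pickle")  # open in the memory_viz viewer
+    _record_memory_history(enabled=None)  # stop recording
+    del y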
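+
+
+# Usage sketch: route allocations into a private MemPool via use_mem_pool().
+# The pool and tensor names are illustrative; assumes a CUDA device is present.
+if __name__ == "__main__" and torch.cuda.is_available():
+    pool = MemPool()  # backed by the current CUDACachingAllocator configuration
+    with use_mem_pool(pool):
+        z = torch.ones(256, 256, device="cuda")  # allocated inside the pool
+    print(f"pool id={pool.id}, use_count={pool.use_count()}")
+    del z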
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
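+
+    Example (illustrative; the exact rows and values depend on the device,
+    the allocator state, and the PyTorch build)::
+
+        # e.g. inside an out-of-memory handler or a periodic training-loop log
+        print(torch.cuda.memory_summary(abbreviated=True))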
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
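+
+    Example (illustrative; the numbers reported vary with the device and its
+    current utilization)::
+
+        free, total = torch.cuda.mem_get_info()
+        print(f"{free / 1024**3:.2f} GiB free of {total / 1024**3:.2f} GiB total")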
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
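+
+
+# --- Hedged usage sketch (not part of the upstream torch.cuda.memory module) ---
+# This illustrates the debugging workflow the docstrings above describe:
+# enable allocation-history recording, run a workload, dump a snapshot for the
+# pytorch.org/memory_viz viewer, and route some allocations through a MemPool.
+# It assumes a CUDA-capable build and device; the file name "snapshot.pickle"
+# and the tensor shapes are arbitrary choices for illustration only.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        # Record Python stack traces and a history of all alloc/free events.
+        _record_memory_history(enabled="all", context="all", stacks="python")
+
+        x = torch.randn(1024, 1024, device="cuda")  # plain allocation
+
+        # Route allocations made inside the context manager to a dedicated pool.
+        pool = MemPool()
+        with use_mem_pool(pool):
+            y = torch.randn(256, 256, device="cuda")
+
+        # Persist the snapshot, then stop recording.
+        _dump_snapshot("snapshot.pickle")
+        _record_memory_history(enabled=None)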
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
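+
+    A minimal sketch of that pattern (illustrative only; it assumes a CUDA device is
+    available and uses an arbitrary matrix product as the per-iteration workload)::
+
+        torch.cuda.reset_peak_memory_stats()
+        x = torch.randn(4096, 4096, device="cuda")
+        y = x @ x
+        print(torch.cuda.max_memory_reserved())  # peak reserved bytes for this iteration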
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object.
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the ``.so`` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function that performs the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function that performs the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently only supported on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
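+
+    A minimal sketch of the intended usage, assuming a CUDA device is available and
+    that ``MemPool`` and ``use_mem_pool`` are re-exported on ``torch.cuda`` (the tensor
+    shape is arbitrary)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # allocations made inside the context are routed to `pool`
+            x = torch.randn(1024, device="cuda")
+        segments = pool.snapshot()  # inspect what the pool currently holds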
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
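+
+
+# A minimal usage sketch for the inspection helpers defined above
+# (list_gpu_processes, mem_get_info, memory_allocated, memory_summary,
+# empty_cache). The helper name below is illustrative only; the calls go
+# through the regular ``torch.cuda`` namespace, which exposes these same
+# functions, and the whole sketch is skipped when no CUDA device is present.
+def _inspection_example() -> None:
+    if not torch.cuda.is_available():
+        print("CUDA is not available; nothing to inspect.")
+        return
+    device = torch.cuda.current_device()
+    x = torch.empty(1024, 1024, device="cuda")  # make the pools non-empty
+    # Per-process view of GPU memory consumers on this device.
+    print(torch.cuda.list_gpu_processes(device))
+    # Device-wide free/total memory as reported by cudaMemGetInfo.
+    free_bytes, total_bytes = torch.cuda.mem_get_info(device)
+    print(f"free {free_bytes} / total {total_bytes} bytes")
+    # Bytes currently owned by tensors vs. the human-readable summary table.
+    print("allocated:", torch.cuda.memory_allocated(device))
+    print(torch.cuda.memory_summary(device, abbreviated=True))
+    del x
+    torch.cuda.empty_cache()  # return cached, unused blocks to the driver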
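+
+
+# A short sketch of the allocator-history workflow documented in
+# _record_memory_history/_snapshot/_dump_snapshot above. The function name
+# and the output filename are arbitrary choices for the example; the
+# underscore-prefixed entry points are the ones defined in this module.
+def _memory_history_example() -> None:
+    if not torch.cuda.is_available():
+        return
+    # Record stack traces and a full history of alloc/free events.
+    torch.cuda.memory._record_memory_history(enabled="all")
+    workload = [torch.randn(256, 256, device="cuda") for _ in range(8)]
+    del workload
+    # Persist the snapshot; it can be opened at pytorch.org/memory_viz.
+    torch.cuda.memory._dump_snapshot("memory_history.pickle")
+    # Stop recording.
+    torch.cuda.memory._record_memory_history(enabled=None)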
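+
+
+# A small sketch of routing allocations through a MemPool with the
+# use_mem_pool context manager defined above. It assumes a PyTorch build
+# that exposes the MemPool API; the pool keeps the default allocator
+# configuration (allocator=None) and the helper name is illustrative.
+def _mem_pool_example() -> None:
+    if not torch.cuda.is_available():
+        return
+    pool = torch.cuda.MemPool()
+    with torch.cuda.use_mem_pool(pool):
+        # Allocations made inside the context are served from `pool`.
+        y = torch.ones(512, 512, device="cuda")
+    print("pool id:", pool.id)
+    print("pool use count:", pool.use_count())
+    del y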
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                           'active_awaiting_free', # waiting for another stream to finish using
+                                                   # this, then it will become free
+                           'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
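+
+# A minimal usage sketch of the helpers defined above (illustrative only, not part
+# of the allocator API): it assumes a CUDA-enabled PyTorch build with at least one
+# visible GPU, and the output file name "example_snapshot.pickle" is arbitrary.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        # Record allocation history so the snapshot below carries stack traces.
+        _record_memory_history("all")
+        x = torch.empty(1024, 1024, device="cuda")  # ~4 MiB of float32
+        print(f"allocated: {memory_allocated()} bytes")
+        print(memory_summary(abbreviated=True))
+        # Dump a snapshot viewable at pytorch.org/memory_viz.
+        _dump_snapshot("example_snapshot.pickle")
+        del x
+        empty_cache()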
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from .
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
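+
+
+# Illustrative usage sketch, not part of the upstream module: it only exercises
+# `_record_memory_history` and `_dump_snapshot` in the workflow described by the
+# docstring above. The helper name and its `workload`/`filename` parameters are
+# placeholders chosen for this example.
+def _example_capture_memory_history(workload, filename="memory_history.pickle"):
+    """Record alloc/free history while running ``workload``, then dump a snapshot.
+
+    ``workload`` is any zero-argument callable; the resulting pickle can be
+    loaded into the interactive viewer at pytorch.org/memory_viz.
+    """
+    _record_memory_history(enabled="all", context="all", stacks="all")
+    try:
+        workload()
+    finally:
+        _dump_snapshot(filename)
+        # Stop recording so later allocations no longer pay the tracing cost.
+        _record_memory_history(enabled=None)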
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
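+
+
+# Illustrative usage sketch, not part of the upstream module: the backend reported
+# by `get_allocator_backend()` is fixed through ``PYTORCH_CUDA_ALLOC_CONF`` when the
+# allocator is first initialized, so this helper only surfaces it for debugging.
+# The helper name is a placeholder chosen for this example.
+def _example_report_allocator_backend() -> str:
+    import os
+
+    backend = get_allocator_backend()
+    conf = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "<unset>")
+    # e.g. "backend=native (PYTORCH_CUDA_ALLOC_CONF=<unset>)"
+    return f"backend={backend} (PYTORCH_CUDA_ALLOC_CONF={conf})"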
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
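+
+
+# Illustrative usage sketch, not part of the upstream module: it mirrors the
+# CUDAPluggableAllocator docstring above. The .so path and the exported symbol
+# names ("my_malloc"/"my_free") are placeholders; a real allocator must export
+# functions with the signatures listed in that docstring, and the switch must
+# happen before the default allocator has been used.
+def _example_install_pluggable_allocator(path_to_so_file="./my_allocator.so"):
+    custom_allocator = CUDAPluggableAllocator(path_to_so_file, "my_malloc", "my_free")
+    change_current_allocator(custom_allocator)
+    return custom_allocator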
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
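+
+
+# Illustrative usage sketch, not part of the upstream module: it shows the
+# intended pattern for `MemPool` and `use_mem_pool` defined above. Allocations
+# made inside the context are served by the pool, and the pool's snapshot can
+# then be inspected. The helper name and tensor size are placeholders.
+def _example_allocate_in_pool(num_elements: int = 1024):
+    pool = MemPool()
+    with use_mem_pool(pool):
+        # This allocation is routed to `pool` instead of the default pool.
+        scratch = torch.empty(num_elements, device="cuda")
+    return scratch, pool.snapshot()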
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
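+
+
+# Usage sketch for MemPool and use_mem_pool above. `_example_allocate_in_pool` is a
+# hypothetical helper and the tensor shape is an arbitrary placeholder.
+def _example_allocate_in_pool() -> None:
+    """Allocate a tensor inside a private MemPool and inspect the pool afterwards."""
+    pool = MemPool()  # pool that follows the current CUDACachingAllocator configuration
+    with use_mem_pool(pool):
+        scratch = torch.empty(1024, 1024, device="cuda")  # served from `pool`
+    print(pool.id, pool.use_count())  # pool identity and reference count
+    del scratch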
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes the amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
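+
+    Example (illustrative sketch; pool behavior may differ between PyTorch versions)::
+
+        >>> pool = torch.cuda.MemPool()  # pool backed by the current allocator configuration
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # this allocation routes to `pool`
+        >>> pool.id  # tuple of two ints identifying this pool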
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
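+
+    Example (illustrative; the raw pointer must later be freed with
+    :func:`~torch.cuda.caching_allocator_delete`)::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)  # 1 KiB on the current device/stream
+        >>> # ... hand the raw pointer to another framework ...
+        >>> torch.cuda.caching_allocator_delete(ptr)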
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
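+
+    Example (illustrative sketch; ``loader`` and ``train_step`` are placeholders)::
+
+        >>> for batch in loader:
+        ...     torch.cuda.reset_peak_memory_stats()
+        ...     train_step(batch)
+        ...     print("peak reserved this iteration:", torch.cuda.max_memory_reserved(), "bytes")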
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
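+
+    Example (illustrative; assumes a CUDA device is available)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     y = torch.ones(4, device="cuda")  # served from `pool`
+        >>> z = torch.ones(4, device="cuda")  # served from the default pool again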
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
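+
+    Example (illustrative; exact numbers depend on the workload)::
+
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> del x  # the tensor is freed, but its memory stays cached
+        >>> torch.cuda.memory_reserved()  # still non-zero
+        >>> torch.cuda.empty_cache()  # return unused cached blocks to the driver
+        >>> torch.cuda.memory_reserved()  # typically smaller now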
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
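+
+    A small usage sketch (illustrative; it assumes CUDA is available, and the
+    exact figures in the printed table depend on the run)::
+
+        >>> report = torch.cuda.memory_summary(abbreviated=True)
+        >>> isinstance(report, str)
+        True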
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
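+
+    A small usage sketch (assumes device 0 exists; the numbers depend on the
+    GPU and on other processes)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info(0)
+        >>> free_bytes <= total_bytes
+        True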
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                              # and adds it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
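+
+    A minimal sketch of routing allocations through a pool (assumes a CUDA
+    device is available; ``use_mem_pool`` is the context manager defined later
+    in this module)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # served from ``pool``
+        >>> pool.use_count() >= 1
+        True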
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes the amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the ``.so`` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
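+
+
+# ---------------------------------------------------------------------------
+# Usage sketch (illustrative only): it assumes a CUDA-capable torch build and
+# that this module is importable as ``torch.cuda.memory``, its upstream home.
+# The file name ``demo.pickle`` is a placeholder, not part of the API.
+#
+#     import torch
+#
+#     # Record allocation stack traces, run a workload, then dump a snapshot
+#     # that can be inspected with the viewer at pytorch.org/memory_viz.
+#     torch.cuda.memory._record_memory_history("all")
+#     x = torch.empty(1024, 1024, device="cuda")
+#     del x
+#     torch.cuda.memory._dump_snapshot("demo.pickle")
+#     torch.cuda.memory._record_memory_history(None)
+#
+#     # Route allocations made inside the context manager to a private pool,
+#     # then print a human-readable summary of the caching allocator's state.
+#     pool = torch.cuda.MemPool()
+#     with torch.cuda.use_mem_pool(pool):
+#         y = torch.empty(1024, device="cuda")
+#     print(torch.cuda.memory_summary(abbreviated=True))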
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                        'active_awaiting_free', # waiting for another stream to finish using
+                                                # this, then it will become free
+                        'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+                filename: str
+                line: int
+                name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc'  # memory allocated
+            'free_requested', # the allocation received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                              # able to be used in future allocation calls
+            'segment_alloc', # the caching allocator asked cudaMalloc for more memory
+                             # and added it as a segment in its cache
+            'segment_free', # the caching allocator called cudaFree to return memory
+                            # to cuda, possibly trying to free up memory to
+                            # allocate more segments or because empty_cache() was called
+            'oom',          # the allocator threw an OOM exception. 'size' is
+                            # the requested number of bytes that did not succeed
+            'snapshot'      # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
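+
+
+# ---------------------------------------------------------------------------
+# Illustrative usage sketch for the statistics helpers defined above. It is
+# kept entirely in comments so the module gains no import-time side effects,
+# and it assumes a CUDA-capable build with at least one visible device.
+#
+#     import torch
+#
+#     if torch.cuda.is_available():
+#         torch.cuda.reset_peak_memory_stats()
+#         x = torch.randn(1024, 1024, device="cuda")
+#         y = x @ x
+#         print(torch.cuda.memory_allocated())      # bytes currently held by tensors
+#         print(torch.cuda.max_memory_allocated())  # peak since the reset above
+#         print(torch.cuda.memory_summary(abbreviated=True))
+# ---------------------------------------------------------------------------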
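+
+
+# ---------------------------------------------------------------------------
+# Illustrative sketch of recording allocation history and dumping a snapshot
+# with the private helpers above, assuming a CUDA device is present. The
+# snapshot structure is described in `_snapshot`'s docstring, and the dumped
+# pickle can be opened at pytorch.org/memory_viz.
+#
+#     import torch
+#
+#     if torch.cuda.is_available():
+#         torch.cuda.memory._record_memory_history(max_entries=100_000)
+#         buffers = [torch.empty(256, 1024, device="cuda") for _ in range(8)]
+#         torch.cuda.memory._dump_snapshot("my_snapshot.pickle")
+#         torch.cuda.memory._record_memory_history(enabled=None)  # stop recording
+# ---------------------------------------------------------------------------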
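+
+
+# ---------------------------------------------------------------------------
+# Illustrative sketch of routing allocations to a MemPool via the context
+# manager above, assuming a CUDA device. Allocations made inside the `with`
+# block are routed to `pool`, and `pool.snapshot()` takes an allocator
+# snapshot while this pool is active (see `MemPool.snapshot` above).
+#
+#     import torch
+#
+#     if torch.cuda.is_available():
+#         pool = torch.cuda.MemPool()
+#         with torch.cuda.use_mem_pool(pool):
+#             t = torch.ones(4, device="cuda")
+#         print(pool.snapshot())
+# ---------------------------------------------------------------------------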
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                              # and adds it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                   'active_awaiting_free',  # waiting for another stream to finish using
+                                            # this, then it will become free
+                   'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocator received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot',  # the allocator generated a memory snapshot
+                         # useful to correlate a previously taken
+                         # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
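+
+
+# Editorial addition (hedged workflow sketch, not part of the upstream module): ties the
+# private recording/snapshot helpers above together the way their docstrings describe.
+# `_example_record_and_dump` and the output filename are hypothetical.
+def _example_record_and_dump(out_file: str = "memory_trace.pickle") -> None:
+    if not torch.cuda.is_available():
+        return
+    _record_memory_history(enabled="all", context="all", stacks="all")
+    try:
+        # Any workload would do; a few throwaway allocations stand in for a training step.
+        for _ in range(3):
+            torch.empty(1024, 1024, device="cuda")
+    finally:
+        _dump_snapshot(out_file)  # open the file at pytorch.org/memory_viz
+        _record_memory_history(enabled=None)  # stop recording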
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
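+
+
+# Editorial addition (hedged sketch, not part of the upstream module): illustrates wiring a
+# pluggable allocator into PyTorch. The shared-library path and the exported symbol names
+# ("my_malloc"/"my_free") are hypothetical; they must match functions you actually compiled
+# with the C signatures documented in CUDAPluggableAllocator above.
+def _example_use_pluggable_allocator() -> None:
+    alloc = CUDAPluggableAllocator(
+        "/path/to/alloc.so",  # hypothetical library location
+        "my_malloc",          # void* my_malloc(ssize_t size, int device, cudaStream_t stream)
+        "my_free",            # void my_free(void* ptr, size_t size, cudaStream_t stream)
+    )
+    # Must run before the default allocator has been used/initialized, or it will error.
+    change_current_allocator(alloc)
+    _ = torch.zeros(8, device="cuda")  # subsequent CUDA allocations go through my_malloc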
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
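+
+
+# Editorial addition (hedged usage sketch, not part of the upstream module): shows the
+# intended pattern for routing allocations into a MemPool via the context manager above.
+# `_example_use_mem_pool` is a hypothetical helper name.
+def _example_use_mem_pool() -> None:
+    if not torch.cuda.is_available():
+        return
+    pool = MemPool()  # default allocator configuration
+    with use_mem_pool(pool):
+        scratch = torch.empty(256, 256, device="cuda")  # allocated from `pool`
+    # Outside the context, new allocations go back to the default pool.
+    print(pool.use_count(), pool.id)
+    del scratch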
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
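+
+    A rough usage sketch (assumes CUDA is available; the reported figures depend
+    on the allocator state at the time of the call)::
+
+        import torch
+
+        x = torch.randn(4096, 4096, device="cuda")
+        print(torch.cuda.memory_summary())                   # full per-pool table
+        print(torch.cuda.memory_summary(abbreviated=True))   # condensed view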
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
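+
+    A small illustrative sketch (assumes a CUDA device is present; the reported
+    numbers are device and driver dependent)::
+
+        import torch
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"free {free_bytes / 2**30:.2f} GiB of {total_bytes / 2**30:.2f} GiB")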
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',     # used by a tensor
+                           'active_awaiting_free', # waiting for another stream to finish using
+                                                   # this, then it will become free
+                           'inactive',]            # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',           # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',   # the caching allocator asks cudaMalloc for more memory
+                                   # and adds it as a segment in its cache
+                'segment_free',    # the caching allocator called cudaFree to return memory
+                                   # to cuda, possibly trying to free up memory to
+                                   # allocate more segments or because empty_cache was called
+                'oom',             # the allocator threw an OOM exception. 'size' is
+                                   # the requested number of bytes that did not succeed
+                'snapshot',        # the allocator generated a memory snapshot
+                                   # useful to correlate a previously taken
+                                   # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a shared object (``.so``) file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
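+
+    A rough usage sketch (this is an experimental interface, so treat the exact
+    shape as an assumption; requires a CUDA device)::
+
+        pool = torch.cuda.MemPool()               # pool backed by the current allocator
+        with torch.cuda.use_mem_pool(pool):
+            x = torch.empty(1024, device="cuda")  # allocation routed to the pool
+        print(pool.use_count())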
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
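The history-recording, snapshot, and summary helpers in the module above are meant to be used together. A minimal usage sketch, assuming a CUDA-capable build with at least one device available; the snapshot file name is illustrative:

import torch

if torch.cuda.is_available():
    # Start tracing alloc/free events together with their stack traces.
    torch.cuda.memory._record_memory_history(max_entries=100_000)

    x = torch.empty(1024, 1024, device="cuda")  # traced allocation
    del x
    torch.cuda.empty_cache()

    # Human-readable allocator statistics for the current device.
    print(torch.cuda.memory_summary(abbreviated=True))

    # Persist the raw snapshot for the pytorch.org/memory_viz viewer, then stop recording.
    torch.cuda.memory._dump_snapshot("alloc_history.pickle")
    torch.cuda.memory._record_memory_history(enabled=None)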
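For the pluggable-allocator path, a hedged sketch under the same assumptions; the shared-library path "./alloc.so" and the symbol names my_malloc/my_free are hypothetical placeholders for functions you have actually compiled with the C signatures documented in CUDAPluggableAllocator:

import torch

# Hypothetical .so and exported symbol names; they must match your compiled allocator.
new_alloc = torch.cuda.memory.CUDAPluggableAllocator("./alloc.so", "my_malloc", "my_free")

# Must run before the default caching allocator has been initialized/used,
# otherwise change_current_allocator raises an error (see its docstring).
torch.cuda.memory.change_current_allocator(new_alloc)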
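Similarly, MemPool and use_mem_pool can route a region of code onto a dedicated allocation pool. A minimal sketch, again assuming a CUDA device; tensor shapes are illustrative:

import torch
from torch.cuda.memory import MemPool, use_mem_pool

if torch.cuda.is_available():
    pool = MemPool()  # backed by the default CUDACachingAllocator configuration
    with use_mem_pool(pool):
        a = torch.ones(256, 256, device="cuda")  # allocation routed to `pool`
    b = torch.ones(256, 256, device="cuda")      # back on the default pool

    print(pool.id, pool.use_count())
    segments = pool.snapshot()  # segments owned by this pool only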
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from .
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
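+
+    Example (a minimal sketch, assuming at least one CUDA device is available
+    and that the process has already made some allocations)::
+
+        if torch.cuda.is_available():
+            print(torch.cuda.memory_summary(device=0, abbreviated=True))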
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
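+
+    Example (a minimal sketch, assuming a CUDA device is present; the call
+    returns ``(free, total)`` in bytes as described above)::
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"free: {free_bytes / 2**30:.2f} GiB of {total_bytes / 2**30:.2f} GiB")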
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
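+
+    Example (a minimal sketch, assuming a CUDA device is available; no custom
+    allocator is passed, so the pool uses the default configuration)::
+
+        pool = torch.cuda.MemPool()
+        print(pool.id, pool.use_count())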
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
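+
+
+# A minimal, illustrative sketch of the pool workflow documented above:
+# allocations made inside `use_mem_pool` are routed to the given MemPool.
+# It assumes a CUDA device is available, is provided only as an example,
+# and is not called anywhere in this module.
+def _example_route_allocations_to_pool() -> None:
+    pool = MemPool()
+    with use_mem_pool(pool):
+        # This allocation is served from `pool` rather than the default pool.
+        scratch = torch.empty(1024, device="cuda")
+    # The pool can be inspected much like the global allocator state.
+    _ = pool.snapshot()
+    del scratch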
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
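+
+    Example (minimal sketch; assumes a CUDA device is available and that these
+    names are exposed as ``torch.cuda.MemPool`` / ``torch.cuda.use_mem_pool``)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     buf = torch.empty(1024, device="cuda")  # allocation is routed to ``pool``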
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
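+
+
+# Illustrative sketch (not part of the module API): one possible way to combine the
+# helpers above to capture an allocation history and dump it for the viewer at
+# pytorch.org/memory_viz. The helper name `_example_dump_history` and the default
+# filename are placeholders, and the function is never called at import time.
+def _example_dump_history(filename: str = "example_snapshot.pickle") -> None:
+    """Record allocation history, run a small CUDA workload, and dump a snapshot."""
+    if not torch.cuda.is_available():
+        return
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    try:
+        workload = torch.randn(1024, 1024, device="cuda")  # something to record
+        del workload
+        empty_cache()
+    finally:
+        _dump_snapshot(filename)  # open the resulting pickle at pytorch.org/memory_viz
+        _record_memory_history(enabled=None)  # stop recording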
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
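+
+    Example (an illustrative sketch, not from the original docstring; ``model``
+    and ``inp`` are placeholders for any module and input already on the GPU)::
+
+        torch.cuda.reset_peak_memory_stats()
+        out = model(inp)
+        peak_bytes = torch.cuda.max_memory_reserved()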
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
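+        # On ROCm builds the per-process list comes from amdsmi rather than pynvml,
+        # and the reported memory field differs as well (vram_mem vs. NVML's
+        # usedGpuMemory in the loop below).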
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes the amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly to free up memory to allocate
+                                 # more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
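+
+    Example (an illustrative sketch, not part of the original docstring; the
+    output file name is just a placeholder)::
+
+        torch.cuda.memory._record_memory_history()
+        # ... run the workload to be profiled ...
+        torch.cuda.memory._dump_snapshot("my_snapshot.pickle")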
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
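+
+    Example (an illustrative sketch, not part of the original docstring; it
+    assumes a CUDA device is available and the usual re-export of this module's
+    public names under ``torch.cuda``)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            buf = torch.empty(1024, device="cuda")  # this allocation is routed to the pool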
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and adds it as a segment in its cache
+                'segment_free',  # the caching allocator calls cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
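+
+
+# Illustrative sketch of loading a custom allocator from a shared library; the
+# library path and the exported symbol names below are placeholders, not a real
+# library:
+#
+#     new_alloc = CUDAPluggableAllocator("my_allocator.so", "my_malloc", "my_free")
+#     change_current_allocator(new_alloc)  # defined below; errors if the active
+#                                          # allocator was already initialized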
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
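+
+    Example (an illustrative sketch; assumes a CUDA device is available)::
+
+        pool = MemPool()         # pool backed by the current allocator configuration
+        print(pool.id)           # ID of this pool as a tuple of two ints
+        print(pool.use_count())  # reference count of the pool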
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
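+
+    Example (an illustrative sketch; assumes a CUDA device is available)::
+
+        pool = MemPool()
+        with use_mem_pool(pool):
+            buf = torch.empty(1024, device="cuda")  # allocation routed to ``pool``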
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
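+
+    Example (an illustrative sketch; the numbers depend on prior allocations)::
+
+        before = torch.cuda.memory_reserved()
+        torch.cuda.empty_cache()
+        after = torch.cuda.memory_reserved()  # typically no larger than ``before``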
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
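+
+    A minimal usage sketch (illustrative only; assumes CUDA is available):
+
+    .. code-block:: python
+
+        # Print an abbreviated summary for the current device, for example
+        # after catching an out-of-memory error during training.
+        print(torch.cuda.memory_summary(abbreviated=True))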
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
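+
+    A minimal usage sketch (illustrative only; assumes CUDA is available):
+
+    .. code-block:: python
+
+        free, total = torch.cuda.mem_get_info()
+        print(f"{free / total:.1%} of device memory is currently free")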
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocator received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
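+
+    A minimal usage sketch (illustrative only; assumes CUDA is available):
+
+    .. code-block:: python
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # Tensors created inside the context are allocated from this pool.
+            x = torch.empty(1024, device="cuda")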
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
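+
+
+# Illustrative sketch, not part of the original module: a typical memory
+# debugging workflow built from the APIs above. The snapshot filename is an
+# arbitrary example and a CUDA-capable device is assumed.
+def _example_memory_debugging_workflow(filename: str = "snapshot.pickle") -> None:
+    # Record allocation/free events together with stack traces.
+    _record_memory_history(enabled="all")
+    # ... run the workload under investigation here ...
+    # Dump a snapshot that can be inspected at pytorch.org/memory_viz.
+    _dump_snapshot(filename)
+    # Stop recording.
+    _record_memory_history(enabled=None)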
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish
+                                                    # using this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception; 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
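+
+
+# ---------------------------------------------------------------------------
+# Illustrative usage sketch (not part of the upstream API surface). It shows
+# how the pieces above fit together: enable history recording, run a workload,
+# dump a snapshot for pytorch.org/memory_viz, and print a readable summary.
+# The workload, the helper name `_example_memory_debugging`, and the output
+# filename "example_snapshot.pickle" are assumptions made only for this sketch.
+def _example_memory_debugging() -> None:
+    if not torch.cuda.is_available():
+        return
+    # Record stack traces plus a full alloc/free history (see _record_memory_history).
+    _record_memory_history(enabled="all", context="all", stacks="all")
+    # Hypothetical workload: a few temporary tensors to generate allocator traffic.
+    for _ in range(3):
+        a = torch.randn(1024, 1024, device="cuda")
+        b = a @ a
+        del a, b
+    # Persist the snapshot for the interactive viewer, then stop recording.
+    _dump_snapshot("example_snapshot.pickle")
+    _record_memory_history(enabled=None)
+    # Human-readable views of the same allocator state.
+    print(memory_summary(abbreviated=True))
+    free, total = mem_get_info()
+    print(f"free={free} bytes, total={total} bytes")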
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
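+
+    A minimal usage sketch (illustrative only; assumes a visible CUDA device and
+    that the tensor size shown is arbitrary)::
+
+        >>> _ = torch.empty(1024, 1024, device="cuda")
+        >>> print(torch.cuda.memory_summary(abbreviated=True))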
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
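+
+    A minimal usage sketch (illustrative only; assumes a visible CUDA device)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB total")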
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                        'active_awaiting_free', # waiting for another stream to finish using
+                                                # this, then it will become free
+                        'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc'  # memory allocated
+            'free_requested', # the allocation received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                              # able to be used in future allocation calls
+            'segment_alloc', # the caching allocator asked cudaMalloc for more memory
+                             # and added it as a segment in its cache
+            'segment_free', # the caching allocator called cudaFree to return memory
+                            # to cuda, possibly trying to free up memory to
+                            # allocate more segments or because empty_cache was called
+            'oom',          # the allocator threw an OOM exception. 'size' is
+                            # the requested number of bytes that did not succeed
+            'snapshot'      # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
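+
+    A minimal usage sketch (illustrative only; assumes a visible CUDA device and
+    relies on :func:`~torch.cuda.use_mem_pool`, defined later in this module)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     buf = torch.empty(1024, device="cuda")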
+
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
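+
+
+if __name__ == "__main__":
+    # Minimal, optional smoke test (an illustrative sketch, not part of the
+    # module's API): it assumes at least one visible CUDA device and simply
+    # exercises a few of the inspection helpers defined above.
+    if torch.cuda.is_available():
+        scratch = torch.empty(256, 256, device="cuda")
+        print("allocated bytes:", memory_allocated())
+        print("reserved bytes:", memory_reserved())
+        free_bytes, total_bytes = mem_get_info()
+        print(f"free/total device memory: {free_bytes} / {total_bytes} bytes")
+        del scratch
+        empty_cache()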
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
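+
+
+# The short sketches below are illustrative only: they show how the snapshot,
+# pluggable-allocator, and memory-pool APIs defined in this module are commonly
+# combined. They assume a CUDA-enabled build with at least one visible device
+# and that the module is importable as ``torch.cuda.memory`` (as in upstream
+# PyTorch); file and symbol names used in the sketches are placeholders.
+
+
+def _example_snapshot_workflow():
+    """Sketch: record allocator history, then dump a snapshot for the viewer.
+
+    Illustrative only; ``example_snapshot.pickle`` is a placeholder file name.
+    """
+    import torch
+
+    if not torch.cuda.is_available():
+        return
+    # Keep stack traces plus a history of alloc/free events.
+    torch.cuda.memory._record_memory_history(max_entries=100_000)
+    # Run a small workload that allocates and frees GPU memory.
+    x = torch.randn(1024, 1024, device="cuda")
+    y = x @ x
+    del x, y
+    # Write a pickle that the pytorch.org/memory_viz viewer can open.
+    torch.cuda.memory._dump_snapshot("example_snapshot.pickle")
+    # Stop recording.
+    torch.cuda.memory._record_memory_history(enabled=None)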
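+
+
+def _example_save_memory_visualizations():
+    """Sketch: render a memory snapshot to SVG flamegraphs.
+
+    Illustrative only; most useful after ``_record_memory_history`` has been
+    enabled, and the ``.svg`` file names are placeholders.
+    """
+    import torch
+
+    if not torch.cuda.is_available():
+        return
+    snap = torch.cuda.memory._snapshot()
+    # Reserved segments, useful when investigating fragmentation.
+    torch.cuda.memory._save_segment_usage("segment_usage.svg", snap)
+    # Allocated memory over the recorded trace.
+    torch.cuda.memory._save_memory_usage("memory_usage.svg", snap)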
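+
+
+def _example_pluggable_allocator():
+    """Sketch: load a custom allocator from a shared library and activate it.
+
+    Illustrative only: ``./alloc.so``, ``my_malloc`` and ``my_free`` are
+    hypothetical names, and the library must export functions matching the
+    signatures documented on ``CUDAPluggableAllocator``. This must run before
+    the default allocator has been used or initialized.
+    """
+    import torch
+
+    new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+        "./alloc.so", "my_malloc", "my_free"
+    )
+    # Route subsequent caching-allocator traffic through the custom allocator.
+    torch.cuda.memory.change_current_allocator(new_alloc)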
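+
+
+def _example_mem_pool_usage():
+    """Sketch: route allocations into a dedicated memory pool.
+
+    Illustrative only; relies on the ``MemPool``/``use_mem_pool`` APIs defined
+    in this module, which may not be available in older PyTorch builds.
+    """
+    import torch
+
+    if not torch.cuda.is_available():
+        return
+    pool = torch.cuda.memory.MemPool()
+    # Allocations made inside the context are served from `pool`.
+    with torch.cuda.memory.use_mem_pool(pool):
+        t = torch.empty(1024, device="cuda")
+    # Inspect the pool via its id and reference count.
+    print(pool.id, pool.use_count())
+    del t
+
+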
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from .
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes the amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
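+
+
+# Example (illustrative sketch, not executed on import): a minimal walkthrough of
+# the statistics, summary, and snapshot helpers defined above. It assumes a
+# CUDA-enabled build of PyTorch; the tensor shapes and the snapshot filename are
+# arbitrary placeholders.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        x = torch.empty(1024, 1024, device="cuda")  # ~4 MiB via the caching allocator
+        print(memory_allocated())  # bytes currently occupied by tensors
+        print(memory_reserved())  # bytes currently held by the caching allocator
+        print(memory_summary(abbreviated=True))
+
+        _record_memory_history()  # start recording alloc/free stack traces
+        y = torch.empty(256, 256, device="cuda")
+        _dump_snapshot("example_snapshot.pickle")  # open at pytorch.org/memory_viz
+        _record_memory_history(None)  # stop recording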
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish
+                                                    # using this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly to free up memory to allocate
+                                 # more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
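+
+
+# An illustrative sketch (not part of the upstream module) of how the Snapshot
+# structure documented above can be consumed: it sums `total_size` over all
+# segments, which per the docstring equals the reserved memory. The helper name
+# `_example_reserved_bytes_from_snapshot` is hypothetical.
+def _example_reserved_bytes_from_snapshot(snapshot=None) -> int:
+    """Return the total cudaMalloc'd bytes recorded in a snapshot (sketch)."""
+    if snapshot is None:
+        snapshot = _snapshot()
+    return sum(segment["total_size"] for segment in snapshot["segments"])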
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
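+
+
+# An illustrative sketch (not part of the upstream module) of the pluggable
+# allocator flow documented above. The library path and the exported symbol
+# names ("my_malloc", "my_free") are hypothetical placeholders; a real library
+# must export functions with the signatures given in the docstring.
+def _example_install_pluggable_allocator(path_to_so_file: str = "./my_alloc.so") -> None:
+    """Load a custom allocator from a shared library and make it active (sketch)."""
+    custom_allocator = CUDAPluggableAllocator(path_to_so_file, "my_malloc", "my_free")
+    # Per change_current_allocator, this must happen before the current
+    # allocator has been used/initialized.
+    change_current_allocator(custom_allocator)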
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
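+
+
+# An illustrative sketch (not part of the upstream module) of the MemPool /
+# use_mem_pool flow documented above, using the default allocator configuration
+# for the pool. The helper name `_example_allocate_in_pool` is hypothetical.
+def _example_allocate_in_pool():
+    """Route a few allocations to a private pool and inspect it (sketch)."""
+    pool = MemPool()  # pool backed by the current CUDACachingAllocator configuration
+    with use_mem_pool(pool):
+        buffers = [torch.empty(1024, device="cuda") for _ in range(4)]
+    # Allocations made inside the context live in `pool`; its snapshot() shows them.
+    return buffers, pool.snapshot()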
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
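+
+    A minimal illustrative example (assuming a CUDA device is available; the
+    table contents depend on the workload)::
+
+        >>> print(torch.cuda.memory_summary(abbreviated=True))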
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
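+
+    A minimal illustrative example (the reported values depend on the local GPU
+    and on other processes sharing it)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> free_bytes <= total_bytes
+        True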
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more than one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # currently cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc', # memory allocated + 'free_requested', # the allocator received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator asked cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda, possibly to free up memory to + # allocate more segments, or because empty_cache() was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to correlate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note::
See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a .so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only on Unix-based operating systems. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool.""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator.
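+
+    A minimal illustrative sketch (assuming a CUDA device is available; routing
+    is done through :func:`~torch.cuda.memory.use_mem_pool`, defined later in
+    this module)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.memory.use_mem_pool(pool):
+        ...     x = torch.empty(64, device="cuda")  # allocation is routed to ``pool``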
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes the amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
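+
+
+# Illustrative usage sketch (not part of the allocator implementation above): how the
+# statistics helpers defined in this module -- memory_allocated(), memory_reserved(),
+# memory_summary() and empty_cache() -- might be used to watch the caching allocator
+# while a tensor is created and released. Assumes a CUDA-enabled build with at least
+# one visible device; the tensor shape is an arbitrary choice.
+if __name__ == "__main__" and torch.cuda.is_available():
+    before = memory_allocated()
+    x = torch.empty(1024, 1024, device="cuda")  # roughly 4 MiB of float32 storage
+    print(f"allocated delta: {(memory_allocated() - before) / 2**20:.1f} MiB")
+    print(f"reserved:        {memory_reserved() / 2**20:.1f} MiB")
+    print(memory_summary(abbreviated=True))
+    del x
+    empty_cache()  # return cached segments to the driver so nvidia-smi reflects the drop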
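+
+
+# Illustrative usage sketch for the history/snapshot helpers above (_record_memory_history,
+# _snapshot and _dump_snapshot): record allocator events around a few allocations and write
+# a pickle that can be loaded into the pytorch.org/memory_viz viewer. Assumes a CUDA device
+# is present; the file name and tensor sizes are arbitrary choices.
+if __name__ == "__main__" and torch.cuda.is_available():
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    tensors = [torch.empty(256, 256, device="cuda") for _ in range(4)]
+    del tensors  # the frees are recorded as trace entries as well
+    _dump_snapshot("allocator_trace.pickle")  # open in the memory_viz viewer
+    _record_memory_history(enabled=None)  # stop recording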
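+
+
+# Illustrative usage sketch for the device-level reporting helpers above: mem_get_info()
+# returns the raw (free, total) byte counts from cudaMemGetInfo, while list_gpu_processes()
+# formats a per-process view. Assumes at least one CUDA device; list_gpu_processes() also
+# needs pynvml (or amdsmi on ROCm) installed, and otherwise returns an explanatory string.
+if __name__ == "__main__" and torch.cuda.is_available():
+    free_bytes, total_bytes = mem_get_info()
+    print(
+        f"current device: {free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB"
+    )
+    print(list_gpu_processes())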
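+
+
+# Illustrative usage sketch for MemPool and the use_mem_pool() context manager defined
+# above: allocations made inside the context are routed to a private pool instead of the
+# default one. Assumes a CUDA device and a build in which the MemPool bindings are
+# available (otherwise only the dummy types defined earlier exist).
+if __name__ == "__main__" and torch.cuda.is_available():
+    pool = MemPool()
+    with use_mem_pool(pool):
+        scratch = torch.empty(512, 512, device="cuda")  # served from `pool`
+    print(f"pool id: {pool.id}, use_count: {pool.use_count()}")
+    del scratch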
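+
+
+# Illustrative sketch only: wiring a custom allocator through CUDAPluggableAllocator and
+# change_current_allocator(). The library path "./alloc.so" and the exported symbol names
+# my_malloc/my_free are hypothetical placeholders -- a real .so exporting functions with
+# the documented C signatures must be supplied -- and, as the docstring above notes, the
+# swap only works before the default allocator has been initialized, hence both guards.
+if __name__ == "__main__":
+    import os
+
+    _so_path = "./alloc.so"  # hypothetical shared library
+    if os.path.exists(_so_path) and not is_initialized():
+        custom = CUDAPluggableAllocator(_so_path, "my_malloc", "my_free")
+        change_current_allocator(custom)
+        print("allocator backend swapped to the pluggable allocator")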
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
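+
+    Example (illustrative sketch; the exact table contents depend on the
+    device and the current workload)::
+
+        >>> # Print an abbreviated per-device summary of allocator statistics.
+        >>> print(torch.cuda.memory_summary(abbreviated=True))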
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
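+
+    Example (illustrative sketch; the reported numbers depend on the current
+    device state)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> free_bytes <= total_bytes
+        True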
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
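+
+    Example (illustrative; the returned value depends on how the process was
+    configured via ``PYTORCH_CUDA_ALLOC_CONF``)::
+
+        >>> torch.cuda.get_allocator_backend()
+        'native'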
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the ``.so`` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
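+
+    Example (illustrative sketch; assumes a CUDA device is available)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> isinstance(pool.id, tuple)
+        True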
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
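+
+
+# Illustrative usage sketch (comments only, not executed): route the allocations
+# made inside a region of code to a dedicated pool. Assumes a CUDA device is
+# available; the tensor size is arbitrary.
+#
+#     pool = MemPool()
+#     with use_mem_pool(pool):
+#         buf = torch.empty(1024, device="cuda")  # allocation served by `pool`
+#     print(pool.snapshot())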
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
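+
+
+# Illustrative usage sketch (not from the upstream module): the helper below
+# shows one way the private memory-history APIs defined above could be wired
+# together -- enable recording, run a small workload, dump a snapshot for the
+# pytorch.org/memory_viz viewer, then disable recording. The function name,
+# workload, and output filename are assumptions made purely for illustration.
+def _example_record_and_dump_snapshot(filename: str = "example_snapshot.pickle") -> None:
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    try:
+        # An arbitrary allocation-heavy workload so the recorded history has entries.
+        x = torch.randn(1024, 1024, device="cuda")
+        y = x @ x
+        del x, y
+        torch.cuda.synchronize()
+        _dump_snapshot(filename)
+    finally:
+        # Passing None stops further history recording.
+        _record_memory_history(enabled=None)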
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
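+
+    A minimal sketch of the pattern described above (illustrative only; it assumes a
+    CUDA device is available)::
+
+        torch.cuda.reset_peak_memory_stats()
+        x = torch.ones(4096, 4096, device="cuda")
+        peak_bytes = torch.cuda.max_memory_reserved()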
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a request to free this memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
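+
+    A minimal usage sketch (illustrative only; it assumes a CUDA device is available
+    and uses the names defined in this module)::
+
+        pool = MemPool()
+        with use_mem_pool(pool):
+            x = torch.randn(1024, device="cuda")  # allocations in this block route to ``pool``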
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
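+
+
+# A minimal, illustrative sketch of the memory-introspection workflow documented above,
+# assuming a CUDA device is available. It uses only names defined in this file
+# (normally exposed under ``torch.cuda`` / ``torch.cuda.memory`` in PyTorch).
+if __name__ == "__main__" and torch.cuda.is_available():
+    _record_memory_history(enabled="all", stacks="python")  # start recording alloc/free events
+    _tmp = torch.randn(1024, 1024, device="cuda")  # generate some allocator traffic
+    del _tmp
+    empty_cache()  # release cached, unused blocks back to the driver
+    print(memory_summary(abbreviated=True))
+    print("peak allocated bytes:", memory_stats().get("allocated_bytes.all.peak", 0))
+    _dump_snapshot("snapshot.pickle")  # view interactively at https://pytorch.org/memory_viz
+    _record_memory_history(enabled=None)  # stop recording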
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                        'active_awaiting_free', # waiting for another stream to finish using
+                                                # this, then it will become free
+                        'inactive',] # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested', # the allocation received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                              # able to be used in future allocation calls
+            'segment_alloc', # the caching allocator asked cudaMalloc for more memory
+                             # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',          # the allocator threw an OOM exception. 'size' is
+                            # the requested number of bytes that did not succeed
+            'snapshot'      # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
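
The pool and context-manager APIs above can be exercised roughly as follows. This is a minimal sketch, not part of the patch itself, assuming a recent CUDA-enabled PyTorch build that exports these names as `torch.cuda.MemPool` and `torch.cuda.use_mem_pool`:

import torch

if torch.cuda.is_available():
    pool = torch.cuda.MemPool()  # None allocator: follow the current CUDACachingAllocator config
    with torch.cuda.use_mem_pool(pool):
        x = torch.empty(1024, device="cuda")  # allocations inside the context route to `pool`
    print(pool.id)               # tuple of two ints identifying the pool
    print(pool.use_count())      # reference count held on the pool
    print(len(pool.snapshot()))  # segments recorded while the pool was active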
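
The diagnostic helpers defined earlier (`mem_get_info`, `list_gpu_processes`, `memory_summary`) are meant for exactly the situation their docstrings describe: reporting state when an allocation fails. A hedged example of wiring them into out-of-memory handling, assuming a CUDA device and, for `list_gpu_processes`, an installed `pynvml` (or `amdsmi` on ROCm):

import torch

def try_allocate(shape):
    try:
        return torch.empty(shape, device="cuda")
    except torch.cuda.OutOfMemoryError:
        free, total = torch.cuda.mem_get_info()  # bytes, as returned by cudaMemGetInfo
        print(f"free: {free / 2**20:.0f} MiB of {total / 2**20:.0f} MiB")
        print(torch.cuda.list_gpu_processes())           # per-process GPU memory use
        print(torch.cuda.memory_summary(abbreviated=True))
        raise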
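
The `_record_memory_history` / `_dump_snapshot` pair is private API (underscore-prefixed), so the following is only an illustrative sketch of the documented workflow, assuming they are reachable as `torch.cuda.memory._record_memory_history` and `torch.cuda.memory._dump_snapshot`:

import torch

torch.cuda.memory._record_memory_history(max_entries=100_000)  # keep traces plus alloc/free events
for _ in range(3):
    buffers = [torch.randn(1024, 1024, device="cuda") for _ in range(8)]
    del buffers
torch.cuda.memory._dump_snapshot("snapshot.pickle")   # open at pytorch.org/memory_viz
torch.cuda.memory._record_memory_history(enabled=None)  # stop recording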
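
A small helper that walks the `Snapshot` structure spelled out in the `_snapshot()` docstring; it relies only on the `segments`, `blocks`, `total_size`, `size` and `state` keys listed there (again a sketch against a private API):

import torch

def reserved_and_active_bytes():
    snap = torch.cuda.memory._snapshot()
    # Reserved memory is the sum of all segment sizes; active memory is the sum
    # of block sizes currently backing tensors.
    reserved = sum(seg["total_size"] for seg in snap["segments"])
    active = sum(
        block["size"]
        for seg in snap["segments"]
        for block in seg["blocks"]
        if block["state"] == "active_allocated"
    )
    return reserved, active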
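
Finally, a hypothetical sketch of plugging in a custom allocator via `CUDAPluggableAllocator` and `change_current_allocator`; `my_alloc.so`, `my_malloc` and `my_free` are placeholder names for a library you would have to build yourself, exporting the C signatures given in the docstring above:

import torch

custom = torch.cuda.memory.CUDAPluggableAllocator(
    "my_alloc.so",  # placeholder path to the compiled allocator library
    "my_malloc",    # void* my_malloc(ssize_t size, int device, cudaStream_t stream);
    "my_free",      # void my_free(void* ptr, size_t size, cudaStream_t stream);
)
# Must run before the default allocator has been used/initialized.
torch.cuda.memory.change_current_allocator(custom)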
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
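+
+    Example (illustrative; the exact table contents depend on the workload)::
+
+        >>> print(torch.cuda.memory_summary())                  # full table for the current device
+        >>> print(torch.cuda.memory_summary(abbreviated=True))  # per-pool breakdown omitted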
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
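+
+    Example (a small sketch of how the two returned values are typically used)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> used_fraction = 1 - free_bytes / total_bytes  # fraction of device memory currently in use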
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
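+
+    Example (an illustrative sketch; allocations made inside the
+    :func:`~torch.cuda.use_mem_pool` context manager defined below are routed to the pool)::
+
+        >>> pool = torch.cuda.MemPool()               # follows the current allocator configuration
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # this allocation lands in ``pool``
+        >>> snapshot = pool.snapshot()                # inspect segments owned by this pool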
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
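+
+
+# Illustrative sketch (not executed): measuring the peak memory of a single
+# iteration, as suggested in the ``max_memory_allocated`` docstring above.
+# ``run_one_iteration`` is a hypothetical user-supplied callable.
+#
+#     torch.cuda.reset_peak_memory_stats()
+#     run_one_iteration()
+#     peak_bytes = torch.cuda.max_memory_allocated()
+#     print(f"peak allocated this iteration: {peak_bytes / 2**20:.1f} MiB")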
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                           'active_awaiting_free', # waiting for another stream to finish using
+                                                   # this, then it will become free
+                           'inactive',] # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc'  # memory allocated
+            'free_requested', # the allocator received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                              # able to be used in future allocation calls
+            'segment_alloc', # the caching allocator asks cudaMalloc for more memory
+                             # and adds it as a segment in its cache
+            'segment_free', # the caching allocator called cudaFree to return memory
+                            # to cuda possibly trying to free up memory to
+                            # allocate more segments or because empty_cache was called
+            'oom',          # the allocator threw an OOM exception. 'size' is
+                            # the requested number of bytes that did not succeed
+            'snapshot'      # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
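+
+
+# Minimal usage sketch for the introspection helpers defined above. This is an
+# illustrative example only: it assumes a CUDA-enabled PyTorch build with at
+# least one visible GPU, and the tensor shapes, variable names, and output file
+# name below are arbitrary. The guard keeps importing this module side-effect
+# free.
+if __name__ == "__main__" and torch.cuda.is_available():
+    # Record alloc/free history so the snapshot contains stack traces.
+    _record_memory_history(enabled="all", context="all", stacks="python")
+
+    # Run a small workload, then inspect allocator state.
+    x = torch.randn(1024, 1024, device="cuda")
+    y = x @ x
+    del x, y
+    print(memory_summary())  # human-readable allocator statistics
+    free_bytes, total_bytes = mem_get_info()
+    print(f"free={free_bytes} bytes, total={total_bytes} bytes")
+    _dump_snapshot("example_snapshot.pickle")  # view at pytorch.org/memory_viz
+
+    # Route allocations made inside the block to a dedicated memory pool.
+    pool = MemPool()
+    with use_mem_pool(pool):
+        z = torch.ones(256, 256, device="cuda")
+    del z
+    empty_cache()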
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
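+
+    A minimal usage sketch (illustrative only; assumes at least one visible CUDA
+    device, and the printed values depend on the run)::
+
+        >>> x = torch.empty(1024, device="cuda")  # any allocation initializes the CUDA context
+        >>> print(torch.cuda.memory_summary(device=0, abbreviated=True))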
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
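+
+    A minimal usage sketch (illustrative only; assumes a CUDA-capable build with
+    at least one visible device)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info(0)
+        >>> print(f"{free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB total")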
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                        'active_awaiting_free', # waiting for another stream to finish using
+                                                # this, then it will become free
+                        'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',          # memory allocated
+            'free_requested', # a call was made to free the allocated memory
+            'free_completed', # the memory that was requested to be freed is now
+                              # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',   # the caching allocator called cudaFree to return memory
+                              # to cuda, possibly to free up memory to
+                              # allocate more segments, or because empty_cache was called
+            'oom',            # the allocator threw an OOM exception. 'size' is
+                              # the requested number of bytes that did not succeed
+            'snapshot'        # the allocator generated a memory snapshot
+                              # useful to correlate a previously taken
+                              # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
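+
+    A minimal usage sketch (illustrative only; assumes a CUDA-capable build with
+    at least one visible device), routing allocations made under
+    :func:`~torch.cuda.use_mem_pool` into this pool::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     buf = torch.empty(1024, device="cuda")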
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
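+
+    Example (a hedged sketch; it assumes the usual re-exports under
+    ``torch.cuda`` and an available CUDA device, and the tensor size is
+    illustrative)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.randn(1024, device="cuda")  # allocation routed to ``pool``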
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
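+
+
+# A hedged monitoring sketch built only from helpers defined in this module;
+# the printed values depend entirely on the workload and device:
+#
+#   print(torch.cuda.memory_allocated())                # bytes currently held by tensors
+#   print(torch.cuda.memory_reserved())                 # bytes cached by the allocator
+#   print(torch.cuda.memory_summary(abbreviated=True))  # human-readable summary table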
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from .
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
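+
+    Example (a hedged sketch; the 1 KiB size is illustrative, and the returned
+    raw pointer must be released with :func:`~torch.cuda.caching_allocator_delete`)::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)
+        >>> torch.cuda.caching_allocator_delete(ptr)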
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the ``.so`` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the ``.so`` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
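+
+
+# A minimal usage sketch of the memory-history workflow documented above
+# (`_record_memory_history`, `_snapshot`, `_dump_snapshot`). It assumes a
+# CUDA-capable build with at least one visible GPU; the tensor shapes and the
+# output file name "example_snapshot.pickle" are illustrative choices, not
+# anything mandated by the module.
+def _example_memory_history_workflow() -> None:
+    if not torch.cuda.is_available():
+        return  # nothing to record without a CUDA device
+
+    # Record alloc/free events together with Python and C++ stack traces.
+    _record_memory_history(enabled="all", context="all", stacks="all")
+
+    # Run some work whose allocations we want to inspect.
+    x = torch.randn(1024, 1024, device="cuda")
+    y = x @ x
+    del x, y
+
+    # Persist the snapshot; it can be loaded into the viewer at pytorch.org/memory_viz.
+    _dump_snapshot("example_snapshot.pickle")
+
+    # Stop recording further history.
+    _record_memory_history(enabled=None)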
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
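+
+    Example (an illustrative sketch, assuming a CUDA-capable device is available)::
+
+        >>> print(torch.cuda.memory_summary())                  # full table for the current device
+        >>> print(torch.cuda.memory_summary(abbreviated=True))  # large/small pool rows are omitted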
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
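+
+    Example (an illustrative sketch, assuming a CUDA-capable device is available)::
+
+        >>> free, total = torch.cuda.mem_get_info()
+        >>> print(f"{free / total:.1%} of device memory is currently free")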
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
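+
+
+# A minimal usage sketch (illustrative only; it assumes a CUDA-capable device and
+# uses the pool helpers defined above): route allocations from a region of code
+# into a private pool, then inspect the allocator state with that pool active.
+#
+#     pool = torch.cuda.MemPool()
+#     with torch.cuda.use_mem_pool(pool):
+#         x = torch.empty(1024, 1024, device="cuda")  # allocation is routed to `pool`
+#     print(pool.snapshot())                          # snapshot taken with this pool active
+#
+# A similar sketch for debugging with the private memory-history helpers above:
+#
+#     torch.cuda.memory._record_memory_history()
+#     y = torch.empty(1024, 1024, device="cuda")
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")  # view at pytorch.org/memory_viz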
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
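A minimal usage sketch for the MemPool / use_mem_pool API defined above. This is editorial, not part of the patch; it assumes the module is importable as torch.cuda.memory (its upstream location, whereas this patch adds the file as asdds.py) and that a CUDA device is available.

    import torch
    from torch.cuda import memory  # assumed import path; the patch itself installs this file as asdds.py

    # Create a pool that follows the current CUDACachingAllocator configuration.
    pool = memory.MemPool()

    # Allocations made inside the context are routed to `pool` on the current device.
    with memory.use_mem_pool(pool):
        x = torch.empty(64, 1024, device="cuda")

    # Inspect only the segments owned by this pool.
    print(pool.snapshot())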
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
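+
+    A minimal illustrative example (assuming a CUDA device is available and
+    this module is importable as ``torch.cuda``)::
+
+        >>> print(torch.cuda.memory_summary(abbreviated=True))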
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
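+
+    A minimal illustrative example (assuming at least one visible CUDA
+    device; the formatting below is only for display)::
+
+        >>> free, total = torch.cuda.mem_get_info()
+        >>> print(f"{free / 2**30:.2f} GiB free of {total / 2**30:.2f} GiB")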
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot,
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
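+
+    Example (illustrative sketch; assumes a CUDA build exposing this module as
+    ``torch.cuda`` and at least one visible device)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # routed to ``pool``
+        >>> pool.id  # identifies the pool inside the CUDACachingAllocator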
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
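+
+
+# ---------------------------------------------------------------------------
+# Illustrative usage sketches (editorial additions, not part of the module's
+# public API). They assume a CUDA-capable build with at least one visible
+# device; the `_example_*` helper names are hypothetical and are never called
+# at import time.
+# ---------------------------------------------------------------------------
+
+
+def _example_measure_peak_allocated(fn, device: Union[Device, int] = None) -> int:
+    """Sketch: measure the peak tensor memory allocated while running ``fn``.
+
+    Resets the peak counters first, so the returned byte count reflects only
+    the allocations made inside ``fn`` (modulo caching-allocator rounding).
+    """
+    reset_peak_memory_stats(device)
+    fn()
+    torch.cuda.synchronize(device)
+    return max_memory_allocated(device)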
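+
+
+def _example_capture_memory_history(fn, out_file: str = "dump_snapshot.pickle") -> None:
+    """Sketch: record alloc/free history while running ``fn`` and dump a snapshot.
+
+    The resulting pickle can be opened with the viewer at pytorch.org/memory_viz.
+    Relies on the private helpers defined above, whose signatures may change.
+    """
+    _record_memory_history("all")
+    try:
+        fn()
+    finally:
+        _dump_snapshot(out_file)
+        _record_memory_history(None)  # stop recording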
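+
+
+def _example_print_host_alloc_counters() -> None:
+    """Sketch: report pinned-host-memory activity seen by the host allocator.
+
+    ``host_memory_stats()`` returns a flat dict; the two event counters below
+    count cudaHostAlloc/cudaHostRegister and cudaHostFree/cudaHostUnregister
+    calls respectively, as documented in its docstring.
+    """
+    stats = host_memory_stats()
+    print("num_host_alloc:", stats.get("num_host_alloc", 0))
+    print("num_host_free: ", stats.get("num_host_free", 0))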
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
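The private helpers documented above (``_record_memory_history``, ``_snapshot``, ``_dump_snapshot``) and the ``MemPool``/``use_mem_pool`` pair are easiest to follow from a short usage sketch. The example below is illustrative only and is not part of the patch; it assumes a CUDA-enabled PyTorch build and uses the upstream ``torch.cuda.memory`` paths that this file mirrors (the snapshot filename is arbitrary):

    import torch

    # Start recording alloc/free history with Python and C++ tracebacks,
    # so the snapshot dumped below contains per-allocation stack traces.
    torch.cuda.memory._record_memory_history(enabled="all", stacks="all")

    # Route a few allocations through a dedicated pool.
    pool = torch.cuda.memory.MemPool()
    with torch.cuda.memory.use_mem_pool(pool):
        x = torch.empty(1024, 1024, device="cuda")

    # Write a pickled snapshot that the viewer at pytorch.org/memory_viz can open.
    torch.cuda.memory._dump_snapshot("example_snapshot.pickle")

    # Stop recording history.
    torch.cuda.memory._record_memory_history(enabled=None)

The snapshot written this way contains the ``segments`` and ``device_traces`` structures described in ``_snapshot``'s docstring, with ``frames`` populated because history recording was enabled first.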
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from .
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
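+
+
+# ---------------------------------------------------------------------------
+# Illustrative usage sketches. These helpers are not part of the upstream
+# torch.cuda.memory API; they are added here only to show how the functions
+# defined above fit together. Each one assumes a CUDA-capable build and is
+# never called at import time.
+# ---------------------------------------------------------------------------
+
+
+def _example_query_allocator_stats(device: Union[Device, int] = None) -> None:
+    # memory_stats() flattens the nested stat dict into dotted keys such as
+    # "allocated_bytes.all.current", while memory_summary() renders the same
+    # counters as a table that is convenient to log. mem_get_info() reports
+    # the device-wide free/total bytes seen by cudaMemGetInfo.
+    if not torch.cuda.is_available():
+        return
+    x = torch.empty(1024, 1024, device="cuda")  # force at least one allocation
+    stats = memory_stats(device=device)
+    print("currently allocated:", stats.get("allocated_bytes.all.current", 0))
+    print("peak allocated:     ", stats.get("allocated_bytes.all.peak", 0))
+    print("reserved by cache:  ", stats.get("reserved_bytes.all.current", 0))
+    free_bytes, total_bytes = mem_get_info(device)
+    print(f"cudaMemGetInfo: {free_bytes} bytes free of {total_bytes} total")
+    print(memory_summary(device=device, abbreviated=True))
+    del x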
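+
+
+def _example_peak_memory_per_step(model, batches) -> None:
+    # Sketch of the pattern mentioned in the max_memory_allocated() docstring:
+    # reset the peak counters before each iteration, then read the per-step
+    # peak afterwards. `model` and `batches` are placeholders supplied by the
+    # caller; any module and iterable of CUDA-movable inputs would do.
+    if not torch.cuda.is_available():
+        return
+    for step, batch in enumerate(batches):
+        reset_peak_memory_stats()
+        loss = model(batch.cuda()).sum()
+        loss.backward()
+        torch.cuda.synchronize()
+        peak_mib = max_memory_allocated() / (1024 * 1024)
+        print(f"step {step}: peak allocated {peak_mib:.1f} MiB")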
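+
+
+def _example_pinned_host_memory_stats() -> None:
+    # Sketch for the host-side counters documented in host_memory_stats():
+    # allocating a pinned (page-locked) tensor goes through the CUDA host
+    # allocator, so the flattened keys ("allocated_bytes.current",
+    # "num_host_alloc", ...) change. Keys are read with .get() because the
+    # exact set can vary between releases.
+    if not torch.cuda.is_available():
+        return
+    pinned = torch.empty(4 * 1024 * 1024, dtype=torch.uint8, pin_memory=True)
+    stats = host_memory_stats()
+    print("pinned bytes currently allocated:", stats.get("allocated_bytes.current", 0))
+    print("host allocation calls so far:    ", stats.get("num_host_alloc", 0))
+    del pinned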
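+
+
+def _example_capture_memory_snapshot(workload, filename="memory_snapshot.pickle") -> None:
+    # Sketch of the _record_memory_history() / _dump_snapshot() workflow
+    # described above: record alloc/free events with stack traces, run the
+    # workload, then write a pickle that the viewer at pytorch.org/memory_viz
+    # can open. `workload` is a placeholder callable supplied by the caller,
+    # and these underscore-prefixed helpers are private, so their signatures
+    # may change between releases.
+    if not torch.cuda.is_available():
+        return
+    _record_memory_history(enabled="all", context="all", stacks="all")
+    try:
+        workload()
+        torch.cuda.synchronize()
+        _dump_snapshot(filename)
+    finally:
+        _record_memory_history(enabled=None)  # stop recording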
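+
+
+def _example_route_allocations_to_pool() -> None:
+    # Sketch of the MemPool / use_mem_pool() API defined above: allocations
+    # made inside the context manager are routed to `pool` rather than to the
+    # allocator's default pool, and pool.snapshot() captures the allocator
+    # state while the pool is active.
+    if not torch.cuda.is_available():
+        return
+    pool = MemPool()
+    with use_mem_pool(pool):
+        x = torch.empty(1024, 1024, device="cuda")
+    print("pool id:       ", pool.id)
+    print("pool use count:", pool.use_count())
+    print("segments seen: ", len(pool.snapshot()))
+    del x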
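+
+
+def _example_install_pluggable_allocator(path_to_so="./my_allocator.so") -> None:
+    # Hypothetical sketch for CUDAPluggableAllocator / change_current_allocator().
+    # `path_to_so`, "my_malloc" and "my_free" are placeholder names for a shared
+    # library you would have to build yourself, exporting the C signatures shown
+    # in CUDAPluggableAllocator.__init__. The swap has to happen before the
+    # default allocator is first used, otherwise change_current_allocator()
+    # raises an error.
+    custom = CUDAPluggableAllocator(path_to_so, "my_malloc", "my_free")
+    change_current_allocator(custom)
+    x = torch.zeros(16, device="cuda")  # now served by the custom allocator
+    del x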
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object.
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
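+
+
+# A minimal usage sketch (illustrative only, not part of the module's public
+# API): a typical debugging workflow with the private snapshot helpers above.
+# Enable history recording, run the suspect code, then dump a pickle that can
+# be opened at https://pytorch.org/memory_viz. The workload is a placeholder.
+def _example_capture_memory_history(out_file: str = "snapshot.pickle") -> None:
+    if not torch.cuda.is_available():
+        return
+    _record_memory_history(enabled="all", context="all", stacks="all")
+    try:
+        # Placeholder workload; substitute the code whose allocations should be traced.
+        x = torch.randn(1024, 1024, device="cuda")
+        y = x @ x
+        del x, y
+    finally:
+        _dump_snapshot(out_file)
+        # Stop recording once the snapshot has been written.
+        _record_memory_history(enabled=None)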
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the ``.so`` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
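+
+
+# A minimal usage sketch (illustrative only, not part of the module's public
+# API): installing a custom allocator compiled into a shared library. The
+# library path and the exported symbol names below are hypothetical; use the
+# ones from your own build. This must run before the default allocator has
+# been initialized, otherwise `change_current_allocator` raises an error.
+def _example_install_pluggable_allocator() -> None:
+    custom = CUDAPluggableAllocator(
+        "./my_allocator.so",  # hypothetical path to the compiled library
+        "my_malloc",  # void* my_malloc(ssize_t size, int device, cudaStream_t stream)
+        "my_free",  # void my_free(void* ptr, size_t size, cudaStream_t stream)
+    )
+    change_current_allocator(custom)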
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
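+
+
+# A minimal usage sketch (illustrative only, not part of the module's public
+# API): allocations made inside the `use_mem_pool` context are routed to a
+# dedicated pool, keeping them in a separate set of segments from the rest of
+# the program.
+def _example_use_mem_pool() -> None:
+    if not torch.cuda.is_available():
+        return
+    pool = MemPool()
+    with use_mem_pool(pool):
+        # This tensor's memory comes from `pool`, not the default pool.
+        scratch = torch.empty(1024, device="cuda")
+    # Outside the context, allocations go back to the default pool.
+    print(f"pool id: {pool.id}, use count: {pool.use_count()}")
+    del scratch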
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes the amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
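
The helpers added above are easiest to read as one workflow: record allocation history, dump a snapshot for the viewer at pytorch.org/memory_viz, then inspect the aggregate counters. The sketch below only illustrates that workflow and is not code from the patch; it assumes a CUDA-enabled build, and `profile_step`, `model`, and `batch` are placeholder names.

    import torch

    def profile_step(model, batch):
        # Record stack traces plus a history of alloc/free events (Python frames only).
        torch.cuda.memory._record_memory_history(enabled="all", context="all", stacks="python")
        try:
            loss = model(batch).sum()
            loss.backward()
        finally:
            # Write a pickle that the interactive viewer at pytorch.org/memory_viz can open.
            torch.cuda.memory._dump_snapshot("snapshot.pickle")
            # Stop recording so later iterations run without tracing overhead.
            torch.cuda.memory._record_memory_history(enabled=None)
        # Aggregate counters for quick inspection in logs.
        print(torch.cuda.memory_summary(abbreviated=True))
        print("peak allocated bytes:", torch.cuda.max_memory_allocated())
        torch.cuda.reset_peak_memory_stats()

Dumping the snapshot inside the `finally` block keeps the pickle even when the step runs out of memory, which is usually the case the history is recorded for.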
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
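+
+    A minimal usage sketch (illustrative only; assumes a CUDA-capable device is
+    available)::
+
+        >>> # abbreviated=True collapses the per-pool breakdown to one row per metric
+        >>> print(torch.cuda.memory_summary(abbreviated=True))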
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
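+
+    A minimal usage sketch (illustrative only; assumes a CUDA-capable device is
+    available)::
+
+        >>> # cudaMemGetInfo reports device-wide figures, not just this process
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB")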
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot,
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
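+
+
+# Illustrative sketch (not part of the module API): how the private helpers above are
+# typically combined. Assumes a CUDA device and a build that exposes
+# torch.cuda.memory._record_memory_history / _dump_snapshot; `run_workload` is a
+# hypothetical user function that allocates CUDA tensors.
+#
+#     torch.cuda.memory._record_memory_history(max_entries=100_000)
+#     run_workload()
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")  # open at pytorch.org/memory_viz
+#     torch.cuda.memory._record_memory_history(enabled=None)  # stop recording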
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator, use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
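+
+    A minimal usage sketch (illustrative only; assumes a CUDA-capable device is
+    available)::
+
+        >>> # allocations made inside the context manager are routed to `pool`
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")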
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
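+
+
+# Illustrative sketch (not part of the module API): inspecting pinned host-memory
+# statistics. Assumes a CUDA build; the host allocator is exercised via pin_memory(),
+# and the stat key shown follows the naming described in host_memory_stats().
+#
+#     t = torch.empty(1024, 1024).pin_memory()
+#     stats = torch.cuda.host_memory_stats()
+#     print(stats.get("allocated_bytes.current", 0))
+#     torch.cuda.reset_peak_host_memory_stats()  # restart peak tracking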
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
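+
+    A minimal usage sketch (assumes a CUDA device is available; the tensor
+    shape is arbitrary)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            out = torch.empty(1024, device="cuda")  # served from ``pool``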
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from .
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
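+
+    A minimal sketch of pairing this call with
+    :func:`~torch.cuda.caching_allocator_delete` (assumes a CUDA device is
+    available; the 1024-byte size is arbitrary)::
+
+        ptr = torch.cuda.caching_allocator_alloc(1024)
+        try:
+            pass  # hand ``ptr`` to another framework
+        finally:
+            torch.cuda.caching_allocator_delete(ptr)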
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
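+
+    A small access sketch (assumes some pinned host allocations have already
+    been made; the key name follows the layout documented above)::
+
+        stats = torch.cuda.host_memory_stats()
+        current_pinned_bytes = stats.get("allocated_bytes.current", 0)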
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
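+
+    Example (an illustrative sketch; ``loader`` and ``train_step`` are hypothetical
+    stand-ins for user code)::
+
+        >>> for batch in loader:
+        ...     torch.cuda.reset_peak_memory_stats()
+        ...     train_step(batch)
+        ...     # peak reserved memory for this iteration, in bytes
+        ...     print(torch.cuda.max_memory_reserved())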
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
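+
+    Example (an illustrative sketch, assuming a CUDA device is available)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     # allocations made inside the block are routed to ``pool``
+        ...     x = torch.randn(1024, device="cuda")
+        >>> # allocations made after the block go back to the default pool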
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
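+
+
+# Illustrative sketch only: ``_demo_dump_snapshot`` is a hypothetical helper (not
+# part of the public API in ``__all__``) showing how the recording and snapshot
+# utilities defined above can be combined.
+def _demo_dump_snapshot(filename: str = "dump_snapshot.pickle") -> None:
+    if not torch.cuda.is_available():
+        return
+    _record_memory_history("all")  # record stack traces and a history of alloc/free events
+    x = torch.randn(1024, 1024, device="cuda")  # placeholder workload
+    del x
+    torch.cuda.synchronize()
+    _dump_snapshot(filename)  # open the resulting pickle at pytorch.org/memory_viz
+    _record_memory_history(None)  # stop recording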
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
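+
+    A minimal sketch of creating a pool and inspecting it (assumes a CUDA build
+    of PyTorch; no allocator is passed, so the pool uses the current
+    CUDACachingAllocator configuration)::
+
+        import torch
+
+        pool = torch.cuda.MemPool()   # pool backed by the current allocator
+        print(pool.id)                # (int, int) identifier of this pool
+        print(pool.use_count())       # reference count held on this pool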
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
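+
+    A minimal sketch of allocating and releasing raw device memory (assumes a
+    CUDA device; the 1 MiB size is arbitrary, and the default device and stream
+    are used)::
+
+        import torch
+
+        ptr = torch.cuda.caching_allocator_alloc(1024 * 1024)  # size in bytes
+        # ... hand `ptr` to another framework or library ...
+        torch.cuda.caching_allocator_delete(ptr)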
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
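+
+    A minimal sketch of the per-iteration measurement described above
+    (``loader`` and ``train_step`` are placeholders for the user's data
+    iterable and GPU work)::
+
+        import torch
+
+        for batch in loader:                        # hypothetical iterable
+            torch.cuda.reset_peak_memory_stats()
+            train_step(batch)                       # hypothetical GPU work
+            peak = torch.cuda.max_memory_reserved()
+            print(f"peak reserved this iteration: {peak / 2**20:.1f} MiB")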
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
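+
+    A minimal sketch of switching to a custom allocator loaded from a shared
+    library (the path and symbol names below are illustrative placeholders, not
+    real files)::
+
+        import torch
+
+        alloc = torch.cuda.memory.CUDAPluggableAllocator(
+            "./my_alloc.so", "my_malloc", "my_free"  # hypothetical .so and symbols
+        )
+        torch.cuda.memory.change_current_allocator(alloc)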
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
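+
+    A minimal sketch of routing an allocation through a pool (assumes a CUDA
+    device; the tensor size is arbitrary)::
+
+        import torch
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            x = torch.empty(1024, device="cuda")  # served from `pool`
+        y = torch.empty(1024, device="cuda")      # back to the default pool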
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
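+
+    A minimal sketch (assumes a CUDA device; the amount still reserved after
+    the call depends on what remains in use, so it need not drop to zero)::
+
+        import torch
+
+        x = torch.empty(1024, 1024, device="cuda")
+        del x                                   # block returns to the cache
+        print(torch.cuda.memory_reserved())     # cache still holds the segment
+        torch.cuda.empty_cache()
+        print(torch.cuda.memory_reserved())     # unused segments released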
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
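+
+    A short usage sketch (illustrative only; device 0 and an abbreviated
+    summary are assumed)::
+
+        >>> print(torch.cuda.memory_summary(device=0, abbreviated=True))
+
+    The printout is a fixed-width table of the statistics reported by
+    :func:`~torch.cuda.memory_stats` (allocated, active, reserved and
+    non-releasable memory, plus OOM and ``cudaMalloc`` retry counters).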
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
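+
+    A minimal sketch of interpreting the return value (illustrative only;
+    assumes device 0 is visible)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info(0)
+        >>> # ``free`` is device-wide, so it also reflects memory held by other processes.
+        >>> print(f"{free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB")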
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
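+
+    A minimal sketch of routing allocations to a pool (illustrative only;
+    it uses the :func:`~torch.cuda.use_mem_pool` context manager defined in
+    this module and assumes it is re-exported under ``torch.cuda``)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.randn(1024, device="cuda")  # served from ``pool``
+        >>> pool.use_count()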
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
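+
+    A minimal allocate/free round trip (illustrative only; the size and
+    device index are arbitrary)::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024, device=0)
+        >>> # ... hand ``ptr`` to another framework ...
+        >>> torch.cuda.caching_allocator_delete(ptr)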
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
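+
+    For example (illustrative only), the flattened keys listed above can be
+    read directly from the returned ordered dict::
+
+        >>> stats = torch.cuda.host_memory_stats()
+        >>> stats.get("allocated_bytes.current", 0)  # pinned host bytes currently allocated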
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
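+
+    A sketch of the per-iteration measurement described above (illustrative
+    only; ``loader`` and ``train_step`` are hypothetical user code)::
+
+        >>> for batch in loader:
+        ...     torch.cuda.reset_peak_memory_stats()
+        ...     train_step(batch)
+        ...     peak = torch.cuda.max_memory_reserved()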
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
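+        # The amdsmi branch below mirrors the pynvml branch above: resolve the
+        # device index to a processor handle, then list the compute processes
+        # currently running on that GPU.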
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes the amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
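+
+
+# --- Editor's note: illustrative usage sketch, not part of the upstream module ----
+# A minimal, hedged example of the main entry points defined above. It assumes a
+# CUDA build of PyTorch and at least one visible GPU; everything is called through
+# the public ``torch.cuda`` wrappers rather than this file's internal helpers, and
+# the snapshot/history calls are the private ``torch.cuda.memory._*`` helpers that
+# are documented in the docstrings above.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        # Basic allocator statistics around a throwaway allocation.
+        torch.cuda.reset_peak_memory_stats()
+        x = torch.empty(1024, 1024, device="cuda")  # ~4 MiB of float32
+        print("allocated bytes:", torch.cuda.memory_allocated())
+        print("reserved bytes: ", torch.cuda.memory_reserved())
+        print("peak allocated: ", torch.cuda.max_memory_allocated())
+
+        # Free/total device memory as reported by cudaMemGetInfo.
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"free/total device memory: {free_bytes}/{total_bytes} bytes")
+
+        # Human-readable summary table and per-process view.
+        print(torch.cuda.memory_summary(abbreviated=True))
+        print(torch.cuda.list_gpu_processes())
+
+        # Record allocation history, then dump a snapshot that can be loaded
+        # in the interactive viewer at https://pytorch.org/memory_viz.
+        torch.cuda.memory._record_memory_history(max_entries=10000)
+        y = x @ x
+        torch.cuda.memory._dump_snapshot("example_snapshot.pickle")
+        torch.cuda.memory._record_memory_history(enabled=None)
+
+        # Release cached blocks back to the driver once the tensors are gone.
+        del x, y
+        torch.cuda.empty_cache()
+    else:
+        print("CUDA is not available; skipping the memory-management demo.")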
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
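+
+
+# Illustrative sketch (not part of the upstream API): one way to combine
+# ``mem_get_info`` and ``memory_allocated`` from this module to report device
+# memory headroom. The helper name ``_example_report_memory`` is hypothetical
+# and the reported numbers depend on the local CUDA setup.
+def _example_report_memory(device: Union[Device, int] = None) -> str:
+    # Global free/total device memory as reported by cudaMemGetInfo, in bytes.
+    free_bytes, total_bytes = mem_get_info(device)
+    # Memory currently occupied by tensors, as tracked by the caching allocator.
+    allocated = memory_allocated(device)
+    return (
+        f"free={free_bytes / 2**20:.1f} MiB, "
+        f"total={total_bytes / 2**20:.1f} MiB, "
+        f"allocated by tensors={allocated / 2**20:.1f} MiB"
+    )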
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
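+
+
+# Illustrative sketch (not part of the upstream API): a minimal capture workflow for
+# the history-recording and snapshot helpers defined above. The helper name
+# ``_example_capture_snapshot`` and the output filenames are hypothetical; the
+# workload callable is supplied by the caller.
+def _example_capture_snapshot(workload, pickle_path="snapshot.pickle", svg_path="memory.svg"):
+    # Start recording alloc/free events with Python stack traces.
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    try:
+        workload()  # run the code whose memory behavior should be inspected
+    finally:
+        # Persist the snapshot for pytorch.org/memory_viz and render a local SVG view.
+        _dump_snapshot(pickle_path)
+        _save_memory_usage(svg_path)
+        # Stop recording further history.
+        _record_memory_history(enabled=None)
+    return pickle_path, svg_path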
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
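+
+
+# Illustrative sketch (not part of the upstream API): wiring up a pluggable allocator
+# with the classes defined above. The shared-library path ``./my_alloc.so`` and the
+# exported symbol names ``my_malloc``/``my_free`` are hypothetical placeholders; an
+# actual build must export functions with the signatures documented in
+# ``CUDAPluggableAllocator.__init__``, and the swap must happen before the default
+# allocator is first used.
+def _example_install_pluggable_allocator() -> None:
+    custom = CUDAPluggableAllocator("./my_alloc.so", "my_malloc", "my_free")
+    change_current_allocator(custom)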
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
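+
+
+# Illustrative sketch (not part of the upstream API): routing a group of allocations
+# into a dedicated pool with the ``MemPool``/``use_mem_pool`` helpers defined above.
+# The helper name ``_example_allocate_in_pool`` and the tensor shape are hypothetical.
+def _example_allocate_in_pool():
+    pool = MemPool()  # pool backed by the current CUDACachingAllocator configuration
+    with use_mem_pool(pool):
+        # Allocations made inside the context are routed to ``pool``.
+        scratch = torch.empty(1024, 1024, device="cuda")
+    # ``pool.snapshot()`` can be inspected to see the segments owned by the pool.
+    return scratch, pool.snapshot()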
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
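A minimal sketch of the pool workflow defined above, assuming a CUDA device is available: allocations made inside the context manager are routed to the pool, which can then be inspected through its methods.

    import torch
    from torch.cuda.memory import MemPool, use_mem_pool

    pool = MemPool()  # backed by the current CUDACachingAllocator configuration
    with use_mem_pool(pool):
        buf = torch.empty(1024, device="cuda")  # allocated from `pool`
    print(pool.use_count())     # reference count held on the pool
    segments = pool.snapshot()  # allocator state captured with this pool active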
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocated block received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly to free up memory for more segments
+                                 # or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
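The statistics above can be read straight from Python. For instance, the rounding-overhead check suggested in the ``memory_stats`` docstring might look like the following sketch (assumes CUDA has been initialized and some allocations have already happened):

.. code-block:: python

    import torch

    stats = torch.cuda.memory_stats()
    requested = stats.get("requested_bytes.all.allocated", 0)
    allocated = stats.get("allocated_bytes.all.allocated", 0)
    if requested:
        # Extra bytes handed out purely because of allocation rounding.
        overhead_pct = (allocated - requested) / requested * 100
        print(f"allocation rounding overhead so far: {overhead_pct:.2f}%")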
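The host-side counters from ``host_memory_stats`` can be inspected the same way; a minimal sketch, assuming pinned-memory allocations have gone through the CUDA host allocator (key names follow the docstring above):

.. code-block:: python

    import torch

    buf = torch.empty(1 << 20, pin_memory=True)  # pinned allocation served by the host allocator
    stats = torch.cuda.host_memory_stats()
    print(stats.get("allocated_bytes.current", 0), "bytes currently pinned")
    print(stats.get("host_alloc_time.count", 0), "timed host allocation calls")
    torch.cuda.reset_peak_host_memory_stats()  # restart peak tracking for host stats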
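The per-iteration measurement mentioned in the ``max_memory_allocated`` docstring can be sketched as follows (the model and input are placeholders; any CUDA workload works the same way):

.. code-block:: python

    import torch

    model = torch.nn.Linear(1024, 1024).cuda()
    inputs = torch.randn(64, 1024, device="cuda")

    for step in range(3):
        torch.cuda.reset_peak_memory_stats()  # open a fresh peak-tracking window
        model(inputs).sum().backward()
        torch.cuda.synchronize()
        peak = torch.cuda.max_memory_allocated()     # peak bytes occupied by tensors
        reserved = torch.cuda.max_memory_reserved()  # peak bytes held by the caching allocator
        print(f"step {step}: {peak / 2**20:.1f} MiB allocated, {reserved / 2**20:.1f} MiB reserved")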
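A sketch of wiring the diagnostic printouts (``memory_summary``, ``list_gpu_processes``, ``mem_get_info``) into an out-of-memory handler; ``try_alloc`` is a hypothetical helper, not part of this module:

.. code-block:: python

    import torch

    def try_alloc(shape):
        """Attempt an allocation; on OOM, print allocator diagnostics and re-raise."""
        try:
            return torch.empty(shape, device="cuda")
        except torch.cuda.OutOfMemoryError:
            stats = torch.cuda.memory_stats()
            print("cudaMalloc retries:", stats.get("num_alloc_retries", 0),
                  "| OOMs:", stats.get("num_ooms", 0))
            print(torch.cuda.memory_summary(abbreviated=True))
            print(torch.cuda.list_gpu_processes())
            free, total = torch.cuda.mem_get_info()
            print(f"free/total on device: {free / 2**30:.2f} / {total / 2**30:.2f} GiB")
            raise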
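End to end, the snapshot workflow described in ``_record_memory_history`` and ``_snapshot`` looks roughly like this (a sketch; these are private, underscore-prefixed helpers and may change between releases):

.. code-block:: python

    import torch

    torch.cuda.memory._record_memory_history(max_entries=100_000)  # start recording alloc/free events
    buffers = [torch.randn(1024, 1024, device="cuda") for _ in range(8)]
    del buffers
    torch.cuda.memory._dump_snapshot("dump_snapshot.pickle")  # open at pytorch.org/memory_viz
    torch.cuda.memory._record_memory_history(enabled=None)    # stop recording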
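A sketch of loading a pluggable allocator with the classes above; ``./my_allocator.so``, ``my_alloc`` and ``my_free`` are hypothetical names for a library exporting the two C signatures listed in the ``CUDAPluggableAllocator`` docstring:

.. code-block:: python

    import torch

    # Assumed exports of the shared library:
    #   void* my_alloc(ssize_t size, int device, cudaStream_t stream);
    #   void  my_free(void* ptr, size_t size, cudaStream_t stream);
    new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
        "./my_allocator.so", "my_alloc", "my_free"
    )
    # Must run before the default caching allocator has served any allocation.
    torch.cuda.memory.change_current_allocator(new_alloc)
    x = torch.zeros(16, device="cuda")  # now routed through my_alloc / my_free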
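Finally, a sketch of routing allocations through a private pool with ``MemPool`` and ``use_mem_pool`` (assuming these names are re-exported under ``torch.cuda`` as in current PyTorch; the pool falls back to the default allocator configuration):

.. code-block:: python

    import torch

    pool = torch.cuda.MemPool()  # new pool inside the caching allocator
    with torch.cuda.use_mem_pool(pool):
        scratch = torch.empty(1 << 20, device="cuda")  # allocated out of `pool`
    print("pool id:", pool.id)
    print("pool refcount:", pool.use_count())
    print("segments visible while pool is active:", len(pool.snapshot()))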
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
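+
+    A minimal usage sketch (illustrative only; it assumes a CUDA-capable build and
+    device, and uses :func:`~torch.cuda.memory.use_mem_pool`, defined below, to
+    route allocations into the pool)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.memory.use_mem_pool(pool):
+        ...     t = torch.empty(1024, device="cuda")  # served from ``pool``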
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
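+
+
+# Illustrative usage sketch added for documentation purposes only: it chains the
+# helpers defined above for a typical memory-debugging session. It is kept behind
+# a __main__/availability guard so importing this module is unaffected, and it
+# assumes a CUDA-capable build with at least one visible device.
+if __name__ == "__main__" and torch.cuda.is_available():
+    _record_memory_history(max_entries=100_000)  # start recording alloc/free events
+    _tensors = [torch.randn(1024, 1024, device="cuda") for _ in range(8)]
+    print(memory_summary())  # human-readable allocator statistics
+    _dump_snapshot("snapshot.pickle")  # open at pytorch.org/memory_viz
+    _record_memory_history(enabled=None)  # stop recording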
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
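+
+    A small illustrative sketch (assumes a CUDA-capable device is available):
+
+    .. code-block:: python
+
+        import torch
+
+        x = torch.randn(4096, 4096, device="cuda")
+        # e.g. print periodically during training, or inside an OOM handler
+        print(torch.cuda.memory_summary(abbreviated=True))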
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
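+
+    An illustrative sketch (assumes a CUDA-capable device is available):
+
+    .. code-block:: python
+
+        import torch
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"free: {free_bytes / 1024**3:.2f} GiB of {total_bytes / 1024**3:.2f} GiB")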
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory CUDA still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
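+
+
+# Illustrative, hedged usage sketch of the MemPool API defined above; it only
+# runs when this file is executed directly and a CUDA device is available.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        pool = MemPool()  # pool backed by the current CUDACachingAllocator configuration
+        with use_mem_pool(pool):
+            x = torch.empty(1024, device="cuda")  # this allocation is routed to `pool`
+        print("pool id:", pool.id)
+        print("pool use_count:", pool.use_count())
+        del x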
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
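+
+
+# A minimal usage sketch of the `MemPool` / `use_mem_pool` helpers defined
+# above, assuming a CUDA-capable build and device. `MemPool()` without an
+# allocator argument falls back to the CUDACachingAllocator's current
+# configuration; the tensor shape below is an arbitrary illustration. Guarded
+# so it never runs on import.
+if __name__ == "__main__" and torch.cuda.is_available():
+    pool = MemPool()
+    with use_mem_pool(pool):
+        y = torch.ones(256, 256, device="cuda")  # allocated out of `pool`
+    print(pool.use_count())  # reference count of this pool
+    print(len(pool.snapshot()))  # allocator segments snapshotted with this pool active
+    del y
+    torch.cuda.empty_cache()  # release unoccupied cached memory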
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
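+
+    Example (a minimal sketch, not an exhaustive demo; it assumes a CUDA device
+    is available and that at least one allocation has been made)::
+
+        x = torch.empty(1024, 1024, device="cuda")
+        print(torch.cuda.memory_summary(abbreviated=True))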
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
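+
+    Example (a minimal sketch; assumes a CUDA device is available)::
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"{free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB total")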
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (~2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',      # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish
+                                                    # using this, then it will become free
+                           'inactive']              # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',           # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed can now
+                                   # be used for future allocation calls
+                'segment_alloc',   # the caching allocator asked cudaMalloc for more memory
+                                   # and added it as a segment in its cache
+                'segment_free',    # the caching allocator called cudaFree to return memory
+                                   # to cuda, possibly to free up memory for more segments
+                                   # or because empty_cache() was called
+                'oom',             # the allocator threw an OOM exception. 'size' is
+                                   # the requested number of bytes that did not succeed
+                'snapshot',        # the allocator generated a memory snapshot, useful to
+                                   # correlate a previously taken snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/current
+            configuration of the CUDACachingAllocator.
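+
+    Example (a minimal sketch; assumes a CUDA device is available and that this
+    module's public names are re-exported under ``torch.cuda``, as in upstream
+    PyTorch)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            x = torch.empty(1024, device="cuda")  # this allocation is routed to the pool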
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
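+
+
+# Illustrative usage sketch (comments only, not part of the module's API): one
+# common way to combine the private snapshot helpers above, assuming a CUDA
+# device is available and this module is importable as ``torch.cuda.memory``.
+#
+#     torch.cuda.memory._record_memory_history(max_entries=100_000)
+#     ...  # run the workload to be profiled
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")
+#     torch.cuda.memory._record_memory_history(enabled=None)  # stop recording
+#     # open the pickle at https://pytorch.org/memory_viz to inspect it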
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
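+
+
+# Editorial sketch (not part of the original patch): a hedged example of swapping in a
+# pluggable allocator as documented above. "alloc.so", "my_malloc" and "my_free" are
+# placeholders for a user-built library exporting the documented C signatures.
+def _example_use_pluggable_allocator() -> None:
+    custom = CUDAPluggableAllocator("alloc.so", "my_malloc", "my_free")
+    # Must run before the default allocator has been initialized by any CUDA
+    # allocation, otherwise change_current_allocator() raises an error.
+    change_current_allocator(custom)
+    t = torch.zeros(16, device="cuda")  # now served by my_malloc / my_free
+    del t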
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
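+
+
+# Editorial sketch (not part of the original patch): a hedged example of routing
+# allocations through a dedicated pool with the context manager defined above;
+# the tensor shapes are illustrative.
+def _example_use_mem_pool() -> None:
+    pool = MemPool()  # pool backed by the current allocator configuration
+    with use_mem_pool(pool):
+        a = torch.empty(1024, device="cuda")  # routed to `pool`
+    b = torch.empty(1024, device="cuda")  # back to the default pool
+    del a, b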
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception.
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
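# Usage sketch (illustrative): how the memory-introspection APIs documented in
# this module are typically driven. Assumes a CUDA-capable torch build that
# exposes this module as torch.cuda.memory; the tensor shape and the snapshot
# filename below are arbitrary examples.
import torch

if torch.cuda.is_available():
    # Keep a history of alloc/free events together with stack traces.
    torch.cuda.memory._record_memory_history(max_entries=100_000)

    x = torch.randn(1024, 1024, device="cuda")  # an allocation to observe
    print(torch.cuda.memory_allocated())        # bytes currently held by tensors
    print(torch.cuda.max_memory_allocated())    # peak since start (or last reset)
    print(torch.cuda.memory_summary(abbreviated=True))

    # Dump a snapshot that the viewer at pytorch.org/memory_viz can open.
    torch.cuda.memory._dump_snapshot("snapshot.pickle")
    torch.cuda.reset_peak_memory_stats()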
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
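+
+    Example (a minimal sketch; assumes a CUDA build of PyTorch)::
+
+        import torch
+
+        pool = torch.cuda.MemPool()   # backed by the current default allocator
+        print(pool.id)                # tuple of two ints identifying the pool
+        print(pool.use_count())       # reference count held on the pool
+        # allocations are routed to the pool via :func:`~torch.cuda.use_mem_pool`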
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
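+
+    Example (a minimal sketch; assumes an initialized CUDA device)::
+
+        import torch
+
+        ptr = torch.cuda.caching_allocator_alloc(1024)   # raw device pointer (int)
+        # ... hand ptr to another framework or library ...
+        torch.cuda.caching_allocator_delete(ptr)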
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
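+
+    For example, individual counters can be read from the returned mapping
+    (a minimal sketch; the key names are documented below)::
+
+        import torch
+
+        stats = torch.cuda.memory_stats()
+        current_allocated = stats["allocated_bytes.all.current"]
+        peak_reserved = stats["reserved_bytes.all.peak"]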
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
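+
+    Example of the per-iteration pattern described above (a minimal sketch;
+    ``train_step`` is a placeholder for one iteration of your own loop)::
+
+        import torch
+
+        torch.cuda.reset_peak_memory_stats()
+        train_step()                                   # hypothetical training iteration
+        peak_bytes = torch.cuda.max_memory_reserved()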
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
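+
+    For example (a sketch; the ``.so`` path and symbol names below are placeholders)::
+
+        import torch
+
+        new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+            "alloc.so", "my_alloc", "my_free"
+        )
+        torch.cuda.memory.change_current_allocator(new_alloc)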
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
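+
+    Example (a minimal sketch; assumes a CUDA-capable device is visible)::
+
+        import torch
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            x = torch.empty(1024, device="cuda")   # served from the pool above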
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
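+
+    Example (a minimal sketch; assumes a CUDA-capable device is visible)::
+
+        import torch
+
+        x = torch.empty(1024, 1024, device="cuda")
+        del x                      # the block returns to the caching allocator
+        torch.cuda.empty_cache()   # cached, unoccupied blocks are released to the driver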
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
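+
+    A minimal usage sketch (illustrative only; assumes a CUDA-capable device is
+    available): the summary can simply be printed, optionally abbreviated to the
+    pool-wide totals::
+
+        >>> # xdoctest: +SKIP("requires a CUDA device")
+        >>> print(torch.cuda.memory_summary(abbreviated=True))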
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
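+
+    A minimal usage sketch (illustrative only; assumes a CUDA-capable device is
+    available): the call returns a ``(free, total)`` pair of byte counts for the
+    selected device::
+
+        >>> # xdoctest: +SKIP("requires a CUDA device")
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> free_bytes <= total_bytes
+        True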
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # a free of the allocated memory was requested
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
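+
+
+# A minimal usage sketch for the pool helpers above (illustrative only; assumes
+# a CUDA-capable device is available). Allocations made inside the context are
+# routed to the given pool:
+#
+#     pool = MemPool()
+#     with use_mem_pool(pool):
+#         buf = torch.empty(1024, device="cuda")  # served from `pool`
+#     print(pool.use_count())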
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocated block received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
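+
+    Example (illustrative sketch, not from the original docs)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> # Assumes MemPool/use_mem_pool are re-exported under torch.cuda,
+        >>> # as listed in this module's __all__.
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     buf = torch.empty(1 << 20, dtype=torch.uint8, device="cuda")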
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
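+
+
+# The two helpers below are minimal, illustrative sketches of the APIs documented
+# above; they are not part of the module's public surface and are never called at
+# import time. `_demo_memory_inspection` strings together `_record_memory_history`,
+# `memory_summary` and `_dump_snapshot`; the output file name "demo_snapshot.pickle"
+# is an arbitrary example. `_demo_pluggable_allocator` shows the intended call
+# sequence for `CUDAPluggableAllocator`; the shared library "./my_allocator.so" and
+# the symbols "my_malloc"/"my_free" are hypothetical placeholders that must match
+# the signatures documented in `CUDAPluggableAllocator.__init__`.
+def _demo_memory_inspection() -> None:
+    if not torch.cuda.is_available():
+        return
+    # Record stack traces for allocations plus a bounded alloc/free history.
+    _record_memory_history(enabled="all", context="all", stacks="python", max_entries=10000)
+    x = torch.empty(1024, 1024, device="cuda")  # ~4 MiB allocation
+    y = torch.empty(1 << 20, device="cuda")  # ~4 MiB allocation, freed below
+    del y
+    # Human-readable allocator statistics for the current device.
+    print(memory_summary(abbreviated=True))
+    print(f"allocated: {memory_allocated()} B, reserved: {memory_reserved()} B")
+    # Persist the snapshot for the viewer at pytorch.org/memory_viz, then stop recording.
+    _dump_snapshot("demo_snapshot.pickle")
+    _record_memory_history(enabled=None)
+    del x
+
+
+def _demo_pluggable_allocator() -> None:
+    # Must run before the native caching allocator has been used, otherwise
+    # change_current_allocator raises an error.
+    custom = CUDAPluggableAllocator("./my_allocator.so", "my_malloc", "my_free")
+    change_current_allocator(custom)
+    # Subsequent CUDA allocations are routed through my_malloc/my_free.
+    _ = torch.empty(16, device="cuda")
+
+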
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from .
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
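+
+    Example (illustrative sketch)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> # Allocate 1 MiB on the current device/stream through the caching
+        >>> # allocator, then return it; the size here is an arbitrary example.
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024 * 1024)
+        >>> torch.cuda.caching_allocator_delete(ptr)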
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
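+
+    A minimal usage sketch (assumes an available CUDA device; the exact table
+    contents vary with allocator state)::
+
+        >>> # xdoctest: +SKIP
+        >>> x = torch.empty(1024, device="cuda")
+        >>> print(torch.cuda.memory_summary(abbreviated=True))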
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
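+
+    A minimal usage sketch (assumes a visible CUDA device; both values are in
+    bytes and describe the whole device, not just this process)::
+
+        >>> # xdoctest: +SKIP
+        >>> free, total = torch.cuda.mem_get_info()
+        >>> print(f"{free / total:.1%} of device memory is free")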
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot,
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object.
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
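+
+    A minimal usage sketch (assumes a CUDA build; the exact string returned
+    depends on how ``PYTORCH_CUDA_ALLOC_CONF`` is configured)::
+
+        >>> # xdoctest: +SKIP
+        >>> torch.cuda.get_allocator_backend()  # e.g. 'native'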
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
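+
+    A minimal usage sketch (assumes a CUDA device; allocations made inside the
+    :func:`~torch.cuda.use_mem_pool` block, defined later in this module, are
+    routed to the pool)::
+
+        >>> # xdoctest: +SKIP
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     t = torch.empty(1024, device="cuda")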
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
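+
+
+# A minimal end-to-end sketch of the snapshot/debugging workflow documented
+# above, kept as comments so importing this module has no side effects. It
+# assumes a CUDA build with at least one visible device; `run_workload()` is a
+# placeholder for the user's own GPU code, not something defined here.
+#
+#     import torch
+#
+#     torch.cuda.memory._record_memory_history(max_entries=100000)
+#     run_workload()  # hypothetical placeholder
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")
+#     # open the pickle in the viewer at https://pytorch.org/memory_viz
+#     print(torch.cuda.memory_summary(abbreviated=True))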
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
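+
+
+# Usage sketch (illustrative only, not part of the upstream module): how a
+# MemPool might be created and inspected. A CUDA device is assumed; no
+# allocator is passed, so the pool follows the current CUDACachingAllocator
+# configuration.
+def _example_inspect_mempool():
+    """Create a MemPool and report its identity and reference count."""
+    pool = MemPool()  # pool backed by the default allocator configuration
+    print("pool id:", pool.id)  # tuple of two ints identifying the pool
+    print("use count:", pool.use_count())  # reference count held on the pool
+    return pool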
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
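+
+
+# Usage sketch (illustrative only, not part of the upstream module): routing
+# allocations through a dedicated pool with `use_mem_pool`. The tensor shape
+# is arbitrary and at least one CUDA device is assumed.
+def _example_allocate_in_pool():
+    """Allocate a tensor inside a private MemPool and return both."""
+    pool = MemPool()
+    with use_mem_pool(pool):
+        # Allocations made inside the context are served from `pool`.
+        t = torch.empty(1024, device="cuda")
+    return pool, t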
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
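+
+    A minimal sketch of the per-iteration measurement described above
+    (``train_step`` and ``batch`` are placeholders, not part of this module;
+    assumes a CUDA device is available)::
+
+        >>> for step in range(3):  # doctest: +SKIP
+        ...     torch.cuda.reset_peak_memory_stats()
+        ...     train_step(batch)  # placeholder for one training iteration
+        ...     print(step, torch.cuda.max_memory_reserved())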
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes the amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
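+
+    A minimal sketch (illustrative only; assumes a CUDA device is available)::
+
+        >>> pool = torch.cuda.MemPool()  # doctest: +SKIP
+        >>> with use_mem_pool(pool):
+        ...     x = torch.ones(1024, device="cuda")  # allocations inside route to ``pool``
+        >>> y = torch.ones(1024, device="cuda")  # allocations outside use the default pool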
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
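+
+
+# Illustrative end-to-end sketch of the introspection utilities defined above.
+# Kept as comments so importing this module has no side effects; it assumes a
+# CUDA device is available and that this module is exposed as
+# ``torch.cuda.memory``:
+#
+#   torch.cuda.memory._record_memory_history(max_entries=100_000)
+#   x = torch.randn(1024, 1024, device="cuda")
+#   print(torch.cuda.memory_allocated())             # bytes currently held by tensors
+#   print(torch.cuda.memory_summary(abbreviated=True))
+#   torch.cuda.memory._dump_snapshot("snapshot.pickle")      # view at pytorch.org/memory_viz
+#   torch.cuda.memory._record_memory_history(enabled=None)   # stop recording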
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',           # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',   # the caching allocator asked cudaMalloc for more memory
+                                   # and added it as a segment in its cache
+                'segment_free',    # the caching allocator called cudaFree to return memory
+                                   # to cuda, possibly trying to free up memory to
+                                   # allocate more segments or because empty_cache was called
+                'oom',             # the allocator threw an OOM exception. 'size' is
+                                   # the requested number of bytes that did not succeed
+                'snapshot',        # the allocator generated a memory snapshot
+                                   # useful to correlate a previously taken
+                                   # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator currently being used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
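
For reference, a minimal usage sketch of the MemPool / use_mem_pool API added above. This is not part of the patch itself; it assumes a CUDA-enabled PyTorch build recent enough to export these names from torch.cuda.memory, as the __all__ list in this file suggests.

    import torch
    from torch.cuda.memory import MemPool, use_mem_pool

    # Create a pool backed by the default CUDACachingAllocator configuration.
    pool = MemPool()

    # Allocations made inside the context are routed to `pool` on the current device.
    with use_mem_pool(pool):
        buf = torch.empty(64 * 1024 * 1024, device="cuda")

    print(pool.id, pool.use_count())  # pool identity and reference count
    del buf  # frees the tensor; its memory stays associated with `pool`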
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
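
The MemPool and use_mem_pool APIs that close out the file route allocations into a private allocator pool. A minimal usage sketch follows, assuming this module is importable as torch.cuda.memory (its imports and __all__ mirror that module); the pool, tensor size, and dtype below are illustrative only, not part of the patch.

import torch
from torch.cuda.memory import MemPool, use_mem_pool  # assumes the module ships as torch.cuda.memory

if torch.cuda.is_available():
    pool = MemPool()  # pool backed by the default caching-allocator configuration

    # Allocations made while the context is active are routed to `pool`.
    with use_mem_pool(pool):
        buf = torch.empty(4 * 1024 * 1024, dtype=torch.uint8, device="cuda")

    print(pool.id)               # (int, int) identifier of the pool
    print(pool.use_count())      # reference count held on the pool
    print(len(pool.snapshot()))  # allocator segments seen while this pool is active

    del buf
    torch.cuda.empty_cache()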
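
The statistics, summary, and snapshot helpers defined earlier in the file can be exercised together when debugging memory use. A short sketch under the same assumption that the module ships as torch.cuda.memory; the 0.5 fraction, the matrix size, and the "mem_trace.pickle" filename are illustrative only.

import torch

if torch.cuda.is_available():
    dev = torch.cuda.current_device()

    # Optionally cap this process at half of the device's visible memory.
    torch.cuda.set_per_process_memory_fraction(0.5, dev)

    # Record alloc/free history so later snapshots carry Python stack traces.
    torch.cuda.memory._record_memory_history(enabled="all", context="all", stacks="python")

    x = torch.randn(2048, 2048, device="cuda")
    y = x @ x

    print(torch.cuda.memory_allocated(dev))  # bytes currently occupied by tensors
    print(torch.cuda.memory_reserved(dev))   # bytes held by the caching allocator
    print(torch.cuda.memory_summary(dev, abbreviated=True))

    # Dump a snapshot viewable at pytorch.org/memory_viz, then stop recording.
    torch.cuda.memory._dump_snapshot("mem_trace.pickle")
    torch.cuda.memory._record_memory_history(enabled=None)

    del x, y
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats(dev)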
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
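+
+
+# A minimal reporting sketch built on the helpers above (illustrative only, not
+# part of the public API). It assumes at least one initialized CUDA device; the
+# helper name and the MiB formatting are arbitrary choices.
+def _example_print_memory_report(device: Union[Device, int] = None) -> None:
+    # Global free/total device memory as reported by cudaMemGetInfo.
+    free, total = mem_get_info(device)
+    print(f"free: {free / 2**20:.0f} MiB / total: {total / 2**20:.0f} MiB")
+    # Allocator statistics and per-process usage, as preformatted strings.
+    print(memory_summary(device=device, abbreviated=True))
+    print(list_gpu_processes(device))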
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
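+
+
+# A hedged end-to-end sketch of the snapshot workflow documented above: enable
+# history recording, run some workload, then persist the snapshot for the
+# pytorch.org/memory_viz viewer. The `workload` callable and file name are
+# placeholders; _record_memory_history and _dump_snapshot are private helpers
+# and may change between releases.
+def _example_capture_memory_history(workload, filename: str = "snapshot.pickle") -> None:
+    _record_memory_history(enabled="all", context="all", stacks="all")
+    try:
+        workload()  # any callable that allocates and frees CUDA memory
+        _dump_snapshot(filename)
+    finally:
+        # Passing None stops further recording.
+        _record_memory_history(enabled=None)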
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
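+
+
+# A sketch of wiring up a custom allocator; the shared-object path and the
+# exported symbol names ("my_malloc", "my_free") are hypothetical and must
+# match functions with the signatures documented in CUDAPluggableAllocator.
+# change_current_allocator (defined below) must run before the default
+# allocator has been initialized.
+def _example_install_pluggable_allocator() -> None:
+    custom = CUDAPluggableAllocator(
+        "./my_allocator.so",  # hypothetical path to the compiled allocator
+        "my_malloc",
+        "my_free",
+    )
+    change_current_allocator(custom)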
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
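+
+
+# A sketch of routing allocations through a dedicated pool with the
+# use_mem_pool context manager defined above. Assumes a CUDA device is
+# available; the tensor shape is arbitrary.
+def _example_allocate_from_pool() -> None:
+    pool = MemPool()
+    with use_mem_pool(pool):
+        # Allocations made inside the context are served from `pool`.
+        x = torch.empty(1024, device="cuda")
+    # Outside the context, allocations go back to the default pool.
+    y = torch.empty(1024, device="cuda")
+    del x, y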
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int #  cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                        'active_awaiting_free', # waiting for another stream to finish using
+                                                # this, then it will become free
+                        'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+                filename: str
+                line: int
+                name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested', # the allocation received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                            # able to be used in future allocation calls
+            'segment_alloc', # the caching allocator asked cudaMalloc for more memory
+                            # and added it as a segment in its cache
+            'segment_free', # the caching allocator called cudaFree to return memory
+                            # to cuda, possibly trying to free up memory to
+                            # allocate more segments or because empty_cache() was called
+            'oom',          # the allocator threw an OOM exception. 'size' is
+                            # the requested number of bytes that did not succeed
+            'snapshot' # the allocator generated a memory snapshot
+                    # useful to correlate a previously taken
+                    # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                            # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
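
The module above mirrors `torch.cuda.memory`, so its public helpers are reachable through `torch.cuda`. The sketch below is illustrative only and is not part of the patch: it assumes a CUDA-capable build, and the `report_step_memory` helper plus the matmul standing in for a real training step are made up for the example. It shows how the peak counters, `memory_summary`, and `mem_get_info` documented above are typically combined around one step of work.

import torch


def report_step_memory(step_fn, device=0):
    # Open a fresh peak-tracking window, run the step, then read the counters.
    torch.cuda.reset_peak_memory_stats(device)
    step_fn()
    torch.cuda.synchronize(device)
    current = torch.cuda.memory_allocated(device)    # bytes currently held by tensors
    peak = torch.cuda.max_memory_allocated(device)   # peak within this window
    reserved = torch.cuda.memory_reserved(device)    # bytes held by the caching allocator
    free, total = torch.cuda.mem_get_info(device)    # device-wide view via cudaMemGetInfo
    print(f"current={current} peak={peak} reserved={reserved} free={free}/{total}")
    print(torch.cuda.memory_summary(device=device, abbreviated=True))


if torch.cuda.is_available():
    report_step_memory(
        lambda: torch.randn(2048, 2048, device="cuda") @ torch.randn(2048, 2048, device="cuda")
    )

Resetting only the peak stats (rather than the accumulated ones) keeps the `allocated`/`freed` totals intact, which matches the split between `reset_peak_memory_stats` and `reset_accumulated_memory_stats` described in the docstrings above.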
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
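+
+
+# Editor's note: illustrative sketch only (not part of the upstream module). It
+# shows one way to inspect a MemPool; the pool created here is throwaway.
+def _example_inspect_mem_pool() -> None:
+    pool = MemPool()  # pool backed by the current CUDACachingAllocator configuration
+    print("pool id:", pool.id)
+    print("pool refcount:", pool.use_count())
+    # Make the pool active for the lifetime of the context object, then confirm
+    # an active pool is reported before the context is dropped.
+    ctx = MemPoolContext(pool)
+    assert MemPoolContext.active_pool() is not None
+    del ctx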
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
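+
+    A minimal usage sketch (illustrative only; assumes a CUDA device is
+    available and that one allocation is representative of the workload)::
+
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> x = torch.empty(1024, 1024, device="cuda")  # served by the caching allocator
+        >>> peak_allocated = torch.cuda.max_memory_allocated()  # peak tensor bytes since the reset
+        >>> peak_reserved = torch.cuda.max_memory_reserved()  # peak bytes held by the allocator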
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                           'active_awaiting_free', # waiting for another stream to finish using
+                                                   # this, then it will become free
+                           'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc', # memory allocated
+            'free_requested', # the allocation received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                              # able to be used in future allocation calls
+            'segment_alloc', # the caching allocator asked cudaMalloc for more memory
+                             # and added it as a segment in its cache
+            'segment_free', # the caching allocator called cudaFree to return memory
+                            # to cuda, possibly trying to free up memory to
+                            # allocate more segments or because empty_cache() was called
+            'oom', # the allocator threw an OOM exception. 'size' is
+                   # the requested number of bytes that did not succeed
+            'snapshot' # the allocator generated a memory snapshot
+                       # useful to correlate a previously taken
+                       # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like OSs.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
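+
+    A minimal usage sketch (illustrative only; assumes a shared library
+    ``./alloc.so`` that exports ``my_malloc``/``my_free`` with the signatures
+    described in ``CUDAPluggableAllocator``)::
+
+        >>> new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+        ...     "./alloc.so", "my_malloc", "my_free"
+        ... )
+        >>> torch.cuda.memory.change_current_allocator(new_alloc)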
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
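+
+    A minimal usage sketch (illustrative only; assumes a CUDA device is
+    available)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # this allocation is routed to ``pool``
+        >>> del x  # the freed block stays associated with ``pool``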
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
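+
+
+# A sketch of a typical memory-debugging workflow built from the helpers above
+# (illustrative only; assumes a CUDA device is available and a writable
+# working directory for the snapshot file):
+#
+#   torch.cuda.memory._record_memory_history(max_entries=100000)
+#   ...  # run the workload you want to profile
+#   torch.cuda.memory._dump_snapshot("snapshot.pickle")
+#   torch.cuda.memory._record_memory_history(enabled=None)  # stop recording
+#
+# The resulting pickle can be inspected with the viewer at
+# https://pytorch.org/memory_viz or with the tools in ``_memory_viz.py``.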
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+                filename: str
+                line: int
+                name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc'  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX OSes
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator currently being used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
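+
+    Example (an illustrative sketch added during editing; it assumes a CUDA
+    device is available and that, as in current PyTorch releases, ``MemPool``
+    and :func:`use_mem_pool` are re-exported under ``torch.cuda``)::
+
+        >>> pool = torch.cuda.MemPool()  # a pool backed by the current allocator
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     buf = torch.ones(4, 1024, device="cuda")  # allocation routed to the pool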
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
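+
+
+# Editorial illustration (added while editing this patch, not part of the
+# upstream module): a minimal sketch of how the recording/snapshot helpers
+# defined above are typically combined to debug GPU memory usage. It assumes
+# a CUDA-capable build and is guarded so importing the module is unaffected.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        # Start recording alloc/free events together with stack traces.
+        _record_memory_history(max_entries=100_000)
+        # Run a small workload whose allocations we want to inspect.
+        x = torch.randn(1024, 1024, device="cuda")
+        y = x @ x
+        # Write the snapshot to disk; it can be opened at https://pytorch.org/memory_viz.
+        _dump_snapshot("memory_snapshot.pickle")
+        # Stop recording.
+        _record_memory_history(enabled=None)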
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
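+
+    A minimal usage sketch, e.g. for logging after a failed allocation
+    (``train_step`` is a hypothetical placeholder for your own code)::
+
+        >>> try:
+        ...     train_step()  # hypothetical training step
+        ... except torch.cuda.OutOfMemoryError:
+        ...     print(torch.cuda.memory_summary(abbreviated=True))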
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
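+
+    A minimal usage sketch converting the returned byte counts into GiB::
+
+        >>> free, total = torch.cuda.mem_get_info()
+        >>> print(f"{free / 2**30:.1f} GiB free of {total / 2**30:.1f} GiB")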
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc'  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly to free up memory for more
+                                 # segments, or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
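+
+    A minimal sketch that routes allocations into a pool through the
+    :func:`~torch.cuda.use_mem_pool` context manager defined in this module::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # served from ``pool``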
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
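+
+
+# A commented usage sketch of the snapshot-based debugging workflow built from the
+# helpers above; ``run_workload`` is a hypothetical placeholder for the code being
+# profiled:
+#
+#   torch.cuda.memory._record_memory_history(max_entries=100_000)
+#   run_workload()
+#   torch.cuda.memory._dump_snapshot("snapshot.pickle")
+#   # open snapshot.pickle in the viewer at pytorch.org/memory_viz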
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc'  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                              # and adds it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
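+
+
+# ---------------------------------------------------------------------------
+# Illustrative usage sketches. These are NOT part of the upstream
+# torch.cuda.memory module; the `_demo_*` helpers below are hypothetical
+# additions that only show how the public APIs defined above fit together.
+# ---------------------------------------------------------------------------
+
+
+def _demo_memory_stats() -> None:
+    # Minimal sketch: allocate a tensor, read back the allocator counters via
+    # memory_allocated()/memory_reserved(), and print a memory_summary().
+    if not torch.cuda.is_available():
+        return
+    x = torch.empty(1024, 1024, device="cuda")  # ~4 MiB of float32
+    print(f"allocated bytes: {memory_allocated()}")
+    print(f"reserved bytes:  {memory_reserved()}")
+    print(memory_summary(abbreviated=True))
+    del x
+    empty_cache()  # return cached blocks so the freed memory shows up in nvidia-smi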
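+
+
+def _demo_snapshot_dump(path: str = "demo_snapshot.pickle") -> None:
+    # Minimal sketch of the history/snapshot workflow described in the
+    # _record_memory_history() and _dump_snapshot() docstrings: enable tracing,
+    # do some allocations, dump a pickle for pytorch.org/memory_viz, then stop.
+    # The default file name here is an arbitrary example.
+    if not torch.cuda.is_available():
+        return
+    _record_memory_history("all")  # record stacks plus a full alloc/free trace
+    try:
+        tensors = [torch.randn(256, 256, device="cuda") for _ in range(8)]
+        del tensors
+    finally:
+        _dump_snapshot(path)          # open at pytorch.org/memory_viz
+        _record_memory_history(None)  # stop recording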
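+
+
+def _demo_mem_pool() -> None:
+    # Minimal sketch of routing allocations through a private pool with
+    # MemPool and the use_mem_pool() context manager, then inspecting the
+    # pool via its id/use_count()/snapshot() helpers. How much of the
+    # snapshot reflects only this pool depends on allocator internals.
+    if not torch.cuda.is_available():
+        return
+    pool = MemPool()
+    with use_mem_pool(pool):
+        y = torch.ones(512, 512, device="cuda")  # served from `pool`
+    print(f"pool id: {pool.id}, use_count: {pool.use_count()}")
+    print(f"segments in pool snapshot: {len(pool.snapshot())}")
+    del y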
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
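+
+
+# A minimal usage sketch for MemPool and use_mem_pool, assuming a CUDA build of
+# PyTorch with at least one visible device; the helper name is illustrative only.
+# Allocations made while the context is active are served from `pool`, and
+# pool.snapshot() takes a memory snapshot with this pool set as the active one.
+def _example_mem_pool_routing():
+    pool = MemPool()
+    with use_mem_pool(pool):
+        buf = torch.empty(1024, 1024, device="cuda")  # routed to `pool`
+    print(pool.use_count())      # reference count held on this pool
+    print(len(pool.snapshot()))  # segments visible while this pool is active
+    del buf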
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream, this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream.
If it is ``None`` then
+            the default stream for the selected device is used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the caching allocator's allocated memory on a CUDA device.
+    The allowed value equals the total visible memory multiplied by the fraction.
+    If a process tries to allocate more than the allowed value, the allocator
+    will raise an out-of-memory error.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and is visible in
+    `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
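+
+    A small illustrative example (assumes a CUDA device is available)::
+
+        >>> x = torch.empty(256, 1024, 1024, device="cuda")
+        >>> del x                      # the block stays cached by the allocator
+        >>> torch.cuda.empty_cache()   # cached blocks are returned to the driver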
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
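+
+    Example (illustrative only; the exact figures depend on the device and the
+    current allocator state)::
+
+        >>> # xdoctest: +SKIP
+        >>> print(torch.cuda.memory_summary(abbreviated=True))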
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
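+
+    Example (illustrative only; the returned byte counts depend on the device
+    and its current usage)::
+
+        >>> # xdoctest: +SKIP
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> used_bytes = total_bytes - free_bytes  # memory currently in use device-wide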
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocated block received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot,
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
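+
+    Example (illustrative only; assumes a CUDA device is available)::
+
+        >>> # xdoctest: +SKIP
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # allocation routed to ``pool``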
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
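+
+# Editor's note (illustrative sketch, not part of the upstream module): the
+# recording/snapshot helpers defined above are usually driven through the
+# public ``torch.cuda.memory`` namespace. Assuming a CUDA-enabled PyTorch
+# build and an arbitrary output filename, a typical session looks like:
+#
+#     import torch
+#
+#     torch.cuda.memory._record_memory_history(enabled="all")      # start recording
+#     x = torch.randn(1024, 1024, device="cuda")                   # workload to profile
+#     del x
+#     torch.cuda.memory._dump_snapshot("example_snapshot.pickle")  # save snapshot
+#     torch.cuda.memory._record_memory_history(enabled=None)       # stop recording
+#
+# The resulting pickle can be inspected with the viewer at pytorch.org/memory_viz
+# or with the tools in `_memory_viz.py`.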
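+
+# Editor's note (illustrative sketch, not part of the upstream module):
+# ``MemPool`` and ``use_mem_pool`` above are typically combined so that every
+# allocation made inside the context manager is routed to a dedicated pool.
+# This assumes the public re-exports ``torch.cuda.MemPool`` and
+# ``torch.cuda.use_mem_pool`` and a CUDA-enabled build:
+#
+#     import torch
+#
+#     pool = torch.cuda.MemPool()               # pool backed by the current allocator
+#     with torch.cuda.use_mem_pool(pool):       # route allocations on the current device
+#         y = torch.empty(1 << 20, device="cuda")
+#     # allocations made after the context exits go back to the default pool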
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                           'active_awaiting_free', # waiting for another stream to finish using
+                                                   # this, then it will become free
+                           'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc', # memory allocated
+            'free_requested', # the allocation received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                              # able to be used in future allocation calls
+            'segment_alloc', # the caching allocator asks cudaMalloc for more memory
+                             # and added it as a segment in its cache
+            'segment_free', # the caching allocator called cudaFree to return memory
+                            # to cuda, possibly trying to free up memory to
+                            # allocate more segments or because empty_cache was called
+            'oom', # the allocator threw an OOM exception. 'size' is
+                   # the requested number of bytes that did not succeed
+            'snapshot' # the allocator generated a memory snapshot
+                       # useful to correlate a previously taken
+                       # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the ``.so`` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
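+
+
+# --- Illustrative usage sketch (editorial addition, not part of the upstream API) ---
+# A minimal, hedged example of how the statistics helpers defined above fit
+# together. It assumes a CUDA-enabled build of PyTorch with at least one GPU
+# visible; on machines without CUDA it does nothing.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        # Allocate a tensor so the caching allocator has something to report.
+        x = torch.empty(1024, 1024, device="cuda")
+
+        # Fine-grained counters come back as a flat dict keyed like
+        # "allocated_bytes.all.current".
+        stats = memory_stats()
+        print("allocated bytes (current):", stats.get("allocated_bytes.all.current", 0))
+        print("allocated bytes (peak):   ", stats.get("allocated_bytes.all.peak", 0))
+
+        # Convenience wrappers over the same counters.
+        print("memory_allocated():", memory_allocated())
+        print("memory_reserved(): ", memory_reserved())
+
+        # Human-readable table built from the same statistics.
+        print(memory_summary(abbreviated=True))
+
+        # Restart peak tracking, e.g. before timing a training iteration.
+        reset_peak_memory_stats()
+
+        # Release the tensor and return unused cached blocks to the driver.
+        del x
+        empty_cache()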
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
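+
+
+# Illustrative sketch (editor's note, not part of the original module): the backend
+# reported above is selected through the ``PYTORCH_CUDA_ALLOC_CONF`` environment
+# variable, typically set before the program starts, e.g.
+#
+#     PYTORCH_CUDA_ALLOC_CONF=backend:cudaMallocAsync python train.py
+#
+# (``train.py`` is a placeholder script name), and can be confirmed at runtime with:
+#
+#     print(torch.cuda.memory.get_allocator_backend())  # 'native' or 'cudaMallocAsync'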
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator currently being used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
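+
+
+# Illustrative sketch (editor's note, not part of the original module): loading a
+# custom allocator with the pluggable-allocator API defined above. The shared-object
+# path and the symbol names are placeholders for a user-provided library.
+#
+#     new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+#         "./my_alloc.so", "my_malloc", "my_free"
+#     )
+#     torch.cuda.memory.change_current_allocator(new_alloc)
+#
+# As noted in its docstring, change_current_allocator() errors out if the default
+# allocator has already been initialized, so this must run before any CUDA allocation.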
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
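+
+
+# Illustrative sketch (editor's note, not part of the original module): routing
+# allocations to a memory pool with the context manager above. Assumes a CUDA
+# device is available.
+#
+#     pool = torch.cuda.memory.MemPool()
+#     with torch.cuda.memory.use_mem_pool(pool):
+#         buf = torch.empty(1024, device="cuda")  # served from `pool`
+#     print(pool.use_count())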
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocated block received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
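+
+
+# A minimal usage sketch for the statistics APIs defined above (memory_allocated,
+# max_memory_allocated, memory_reserved, memory_summary, empty_cache). It assumes
+# a CUDA build with at least one visible device; the tensor shape is an arbitrary
+# illustration value, not part of any API.
+def _demo_memory_stats_usage() -> None:
+    if not torch.cuda.is_available():
+        return
+    # Allocate something so the counters have data to report.
+    x = torch.empty(1024, 1024, device="cuda")
+    print(f"allocated:      {memory_allocated()} bytes")
+    print(f"peak allocated: {max_memory_allocated()} bytes")
+    print(f"reserved:       {memory_reserved()} bytes")
+    # Formatted per-pool breakdown of the same counters.
+    print(memory_summary(abbreviated=True))
+    del x
+    # Return unused cached blocks to the driver so other applications can use them.
+    empty_cache()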
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
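+
+
+# A sketch of the memory-history workflow documented in _record_memory_history,
+# _snapshot, and _dump_snapshot above. The output filename "snapshot.pickle" is
+# an arbitrary example; the resulting file can be loaded in the viewer at
+# pytorch.org/memory_viz.
+def _demo_memory_history(out_path: str = "snapshot.pickle") -> None:
+    if not torch.cuda.is_available():
+        return
+    # Start recording alloc/free events together with Python stack traces.
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    # Run some workload whose allocations we want to attribute.
+    tensors = [torch.randn(256, 256, device="cuda") for _ in range(8)]
+    del tensors
+    # Persist the snapshot for offline inspection, then stop recording.
+    _dump_snapshot(out_path)
+    _record_memory_history(enabled=None)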
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
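+
+
+# A small sketch combining mem_get_info and list_gpu_processes from above into a
+# device-wide memory report. The 90% threshold is an arbitrary illustration
+# value, not something the allocator defines.
+def _demo_device_report(device: int = 0) -> None:
+    if not torch.cuda.is_available():
+        return
+    free_bytes, total_bytes = mem_get_info(device)
+    used_fraction = 1.0 - free_bytes / total_bytes
+    print(f"device {device}: {free_bytes} of {total_bytes} bytes free")
+    if used_fraction > 0.9:
+        # When the device is nearly full, show which processes hold memory.
+        print(list_gpu_processes(device))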
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
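+
+
+# A sketch of loading a custom allocator with CUDAPluggableAllocator and making
+# it active via change_current_allocator, as documented above. The shared-library
+# path and the exported symbol names ("my_alloc", "my_free") are hypothetical and
+# must match functions you compiled yourself with the documented C signatures.
+def _demo_pluggable_allocator(so_path: str = "./my_allocator.so") -> None:
+    custom = CUDAPluggableAllocator(so_path, "my_alloc", "my_free")
+    # This must run before the default allocator has been used/initialized,
+    # otherwise change_current_allocator raises an error.
+    change_current_allocator(custom)
+    if torch.cuda.is_available():
+        _ = torch.zeros(16, device="cuda")  # now served by the custom allocator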
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
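+
+
+# A sketch of routing allocations into a private pool with MemPool and the
+# use_mem_pool context manager defined above, then inspecting it with
+# MemPool.snapshot(). Tensor sizes are arbitrary illustration values.
+def _demo_mem_pool() -> None:
+    if not torch.cuda.is_available():
+        return
+    pool = MemPool()
+    with use_mem_pool(pool):
+        # Allocations made inside this block are served from `pool`.
+        kept = torch.ones(512, 512, device="cuda")
+    print(f"pool id: {pool.id}, use_count: {pool.use_count()}")
+    # Snapshot taken while the pool is active; its length is the segment count.
+    print(f"segments in snapshot: {len(pool.snapshot())}")
+    del kept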
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
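+
+
+# A sketch of walking the raw allocator state returned by memory_snapshot()
+# above. Only fields named in the _snapshot docstring (address, total_size,
+# blocks, state) are used; the workload itself is arbitrary.
+def _demo_inspect_snapshot() -> None:
+    if not torch.cuda.is_available():
+        return
+    _ = torch.empty(2048, 2048, device="cuda")
+    for segment in memory_snapshot():
+        active = sum(1 for b in segment["blocks"] if b["state"] == "active_allocated")
+        print(
+            f"segment @ 0x{segment['address']:x}: "
+            f"{segment['total_size']} bytes, {active} active block(s)"
+        )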
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
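+
+
+# A sketch around get_allocator_backend() documented above: branch on the
+# backend, since several caching-allocator statistics are reported as zero
+# when the cudaMallocAsync backend is selected. Purely illustrative.
+def _demo_backend_check() -> None:
+    backend = get_allocator_backend()
+    if backend == "cudaMallocAsync":
+        print("cudaMallocAsync backend: some caching-allocator stats read as zero")
+    else:
+        print(f"native caching allocator in use ({backend})")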
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
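+
+
+# NOTE (editorial sketch): the block below is an illustrative usage example and
+# is not part of the upstream allocator code. It sketches, assuming a
+# CUDA-capable device is available, how the MemPool and use_mem_pool APIs
+# defined above can route allocations into a private pool. The helper name
+# `_example_mem_pool_usage` is introduced here purely for illustration and is
+# never called on import.
+def _example_mem_pool_usage():
+    r"""Sketch: allocate a tensor inside a private memory pool and inspect it."""
+    if not torch.cuda.is_available():
+        return
+    pool = MemPool()  # pool backed by the current CUDACachingAllocator configuration
+    with use_mem_pool(pool):
+        # Allocations made inside this context are routed to `pool`.
+        x = torch.empty(1024, device="cuda")
+    # The pool's segments can be inspected much like the global allocator state.
+    print("pool use_count:", pool.use_count())
+    print("pool segments:", len(pool.snapshot()))
+    del x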
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator currently being used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
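+
+
+# Editor's illustrative sketch, not part of the original module: loading a
+# pluggable allocator and making it the active backend. The shared-library
+# path and exported symbol names below are hypothetical placeholders.
+def _example_install_pluggable_allocator() -> None:
+    custom = CUDAPluggableAllocator(
+        "./my_allocator.so",  # hypothetical library built against the CUDA runtime
+        "my_malloc",          # void* my_malloc(ssize_t size, int device, cudaStream_t stream)
+        "my_free",            # void  my_free(void* ptr, size_t size, cudaStream_t stream)
+    )
+    # Must run before the default caching allocator has been initialized/used.
+    change_current_allocator(custom)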
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
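+
+
+# Editor's illustrative sketch, not part of the original module: routing the
+# allocations made inside a code region to a dedicated pool.
+def _example_allocate_into_pool() -> None:
+    pool = MemPool()  # pool backed by the current CUDACachingAllocator configuration
+    with use_mem_pool(pool):
+        scratch = torch.empty(1 << 20, device="cuda")  # served from `pool`
+    print("pool id:", pool.id, "use count:", pool.use_count())
+    del scratch
+
+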
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from .
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc'  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
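+
+    Example (an illustrative sketch added by the editor; it assumes a
+    CUDA-enabled build and is not part of the original docstring)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     buf = torch.empty(1024, device="cuda")  # allocation routed to `pool`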
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
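+
+
+# NOTE (editor's addition): the helper below is an illustrative sketch, not part
+# of the original module. It strings together the private debugging hooks
+# documented above (_record_memory_history, _dump_snapshot) to show the typical
+# record -> allocate -> dump workflow. The output filename is an arbitrary
+# example, and the function assumes a CUDA-enabled build of PyTorch.
+def _example_memory_history_workflow(out_file: str = "example_snapshot.pickle") -> None:
+    # Start recording stack traces and a full alloc/free history.
+    _record_memory_history(enabled="all")
+    try:
+        # Perform some allocations whose history we want to inspect.
+        scratch = [torch.empty(1 << 20, device="cuda") for _ in range(4)]
+        del scratch
+        # Write a snapshot that can be loaded in the viewer at pytorch.org/memory_viz.
+        _dump_snapshot(out_file)
+    finally:
+        # Stop recording so later allocations are no longer tracked.
+        _record_memory_history(enabled=None)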
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
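+
+    Example (a sketch only; the exact table contents depend on the run)::
+
+        >>> print(torch.cuda.memory_summary())  # doctest: +SKIP
+        >>> # or, e.g. while handling a torch.cuda.OutOfMemoryError:
+        >>> print(torch.cuda.memory_summary(device=0, abbreviated=True))  # doctest: +SKIP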
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
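+
+    Example (illustrative; the numbers are whatever the driver reports for
+    device 0 at call time)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info(0)  # doctest: +SKIP
+        >>> print(f"{free_bytes / 1024**3:.1f} GiB free of {total_bytes / 1024**3:.1f} GiB")  # doctest: +SKIP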
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
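+
+    A minimal sketch of routing allocations through a pool (illustrative only;
+    it assumes the default allocator and a CUDA-capable device)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # allocated from `pool`
+        >>> y = torch.empty(1024, device="cuda")      # allocated from the default pool again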
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
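+
+    Example (illustrative sketch; the reported values depend entirely on the
+    workload, so treat the numbers as placeholders)::
+
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> print(torch.cuda.memory_summary(abbreviated=True))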
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
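+
+    Example (illustrative sketch; the exact numbers depend on the device and on
+    other processes sharing it)::
+
+        >>> free, total = torch.cuda.mem_get_info()
+        >>> print(f"{free / 1024**3:.2f} GiB free of {total / 1024**3:.2f} GiB")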
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache() was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the ``.so`` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
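+
+    Example (illustrative sketch; assumes the names listed in ``__all__`` are
+    re-exported under ``torch.cuda``, as for the other helpers in this module)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.ones(1024, device="cuda")  # routed to ``pool``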
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
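+
+
+# Illustrative sketch only, not part of the module API: a tiny helper showing how
+# the statistics above compose to measure the peak memory used by one callable.
+# The helper name `_example_measure_peak_allocated` and its `fn` argument are made
+# up for this example; every call it makes is defined in this module or in torch.cuda.
+def _example_measure_peak_allocated(fn, device: Union[Device, int] = None) -> int:
+    """Run ``fn()`` once and return the peak allocated bytes it caused on ``device``."""
+    reset_peak_memory_stats(device=device)
+    fn()
+    torch.cuda.synchronize(device)
+    return max_memory_allocated(device=device)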
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
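+        # ROCm/amdsmi path (editor's descriptive comment): map the torch device index to an
+        # amdsmi processor handle and query its process list, mirroring the pynvml branch above.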
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
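+
+
+def _example_usage() -> None:
+    # Editor's addition, not part of the upstream module: a minimal, hedged
+    # sketch of the statistics, snapshot and MemPool APIs documented above.
+    # It assumes a CUDA-capable build and device; the snapshot filename is
+    # illustrative. The function is defined but never called on import.
+    if not torch.cuda.is_available():
+        return
+
+    # Record alloc/free history so the snapshot carries stack traces.
+    _record_memory_history("all")
+
+    x = torch.empty(1024, 1024, device="cuda")  # a 4 MiB float32 tensor
+    stats = memory_stats()
+    print("allocated_bytes.all.current:", stats["allocated_bytes.all.current"])
+    print(memory_summary(abbreviated=True))
+    print("free/total device bytes:", mem_get_info())
+
+    # Route allocations made inside the context manager to a private pool.
+    pool = MemPool()
+    with use_mem_pool(pool):
+        y = torch.ones(256, 256, device="cuda")
+
+    del x, y
+    empty_cache()
+
+    # Dump a snapshot that can be inspected at https://pytorch.org/memory_viz.
+    _dump_snapshot("dump_snapshot.pickle")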
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int #  cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                        'active_awaiting_free', # waiting for another stream to finish using
+                                                # this, then it will become free
+                        'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+                filename: str
+                line: int
+                name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',          # memory allocated
+            'free_requested', # the allocation received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                              # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',   # the caching allocator called cudaFree to return memory
+                              # to cuda, possibly to free up memory to
+                              # allocate more segments or because empty_cache was called
+            'oom',            # the allocator threw an OOM exception. 'size' is
+                              # the requested number of bytes that did not succeed
+            'snapshot'        # the allocator generated a memory snapshot
+                              # useful to correlate a previously taken
+                              # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
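+
+    A small usage sketch (assuming ``MemPool`` and ``use_mem_pool`` are
+    importable from ``torch.cuda``, as the module's ``__all__`` list suggests):
+
+    Example::
+
+        pool = torch.cuda.MemPool()           # pool backed by the current allocator
+        with torch.cuda.use_mem_pool(pool):
+            x = torch.empty(1024, device="cuda")  # this allocation is routed to pool
+        print(pool.use_count())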
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If it is ``None``, then
+            the default stream for the selected device is used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the caching allocator to allocating memory on a CUDA device.
+    The allowed value equals the total visible memory multiplied by the fraction.
+    Trying to allocate more than the allowed value in a process raises an
+    out-of-memory error in the allocator.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and becomes
+    visible in `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
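+
+    A minimal sketch of the typical call pattern (assuming the deleted tensor
+    held the only reference to its storage):
+
+    Example::
+
+        x = torch.empty(1024, 1024, device="cuda")
+        del x                     # block returns to the caching allocator
+        torch.cuda.empty_cache()  # cached blocks are handed back to the driver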
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
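For illustration, a minimal sketch of per-iteration peak tracking via :func:`~torch.cuda.reset_peak_memory_stats`, which this function delegates to (the workload below is a placeholder):

    import torch

    for step in range(3):
        torch.cuda.reset_peak_memory_stats()
        a = torch.randn(512, 512, device="cuda")
        b = a @ a  # placeholder workload
        # Peak cached memory observed since the reset at the top of the loop.
        print(step, torch.cuda.max_memory_reserved())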
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
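For illustration, a minimal sketch of printing an abbreviated summary when an allocation fails (assumes a CUDA device; the oversized request is deliberate):

    import torch

    try:
        too_big = torch.empty(1 << 40, device="cuda")  # deliberately oversized
    except RuntimeError:  # CUDA out-of-memory errors are raised as a RuntimeError subclass
        print(torch.cuda.memory_summary(abbreviated=True))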
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
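For illustration, a minimal sketch that reports free and total device memory in GiB (assumes a CUDA device is present):

    import torch

    # mem_get_info returns (free_bytes, total_bytes), as reported by cudaMemGetInfo.
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    print(f"free {free_bytes / 2**30:.2f} GiB of {total_bytes / 2**30:.2f} GiB")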
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
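For illustration, a minimal sketch of routing allocations into a pool, assuming these names are re-exported as ``torch.cuda.MemPool`` and ``torch.cuda.use_mem_pool`` (as the module's ``__all__`` suggests) and a CUDA device is available:

    import torch

    pool = torch.cuda.MemPool()
    with torch.cuda.use_mem_pool(pool):
        t = torch.empty(1024, device="cuda")  # served from `pool`
    print(pool.use_count())  # reference count of the pool
    print(pool.snapshot())   # segments currently owned by this pool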
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
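
As a quick usage illustration (not part of the patch itself), the sketch below exercises the statistics and snapshot helpers defined in the module added above. It assumes the module is exposed through `torch.cuda` / `torch.cuda.memory` as in upstream PyTorch, that a CUDA device is available, and that the file name "snapshot.pickle" is arbitrary.

    import torch

    if torch.cuda.is_available():
        # Record alloc/free history so the snapshot carries stack traces.
        torch.cuda.memory._record_memory_history("all")

        x = torch.empty(1024, 1024, device="cuda")   # ~4 MiB of float32
        print(torch.cuda.memory_allocated())          # bytes currently used by tensors
        print(torch.cuda.memory_reserved())           # bytes held by the caching allocator
        print(torch.cuda.memory_summary(abbreviated=True))

        # Dump a snapshot readable by the viewer at pytorch.org/memory_viz.
        torch.cuda.memory._dump_snapshot("snapshot.pickle")

        del x
        torch.cuda.empty_cache()               # release unused cached segments
        torch.cuda.reset_peak_memory_stats()   # restart peak tracking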
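Similarly, a minimal sketch of the MemPool / use_mem_pool path defined above, again illustrative rather than part of the patch, and assuming a PyTorch build recent enough to export these names from `torch.cuda`:

    import torch

    if torch.cuda.is_available():
        pool = torch.cuda.MemPool()            # pool backed by the current allocator config
        with torch.cuda.use_mem_pool(pool):
            y = torch.ones(256, 256, device="cuda")   # allocation routed into `pool`
        print(pool.use_count())                # reference count of the pool
        print(len(pool.snapshot()))            # segments recorded while the pool was active
        del y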
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+
+    See :func:`~torch.cuda.max_memory_cached` for details.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. warning::
+        This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets
+        /all/ peak memory stats.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    warnings.warn(
+        "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, "
+        "which resets /all/ peak memory stats.",
+        FutureWarning,
+    )
+    return reset_peak_memory_stats(device=device)
+
+
+def memory_allocated(device: Union[Device, int] = None) -> int:
+    r"""Return the current GPU memory occupied by tensors in bytes for a given device.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        This is likely less than the amount shown in `nvidia-smi` since some
+        unused memory can be held by the caching allocator and some context
+        needs to be created on GPU. See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    return memory_stats(device=device).get("allocated_bytes.all.current", 0)
+
+
+def max_memory_allocated(device: Union[Device, int] = None) -> int:
+    r"""Return the maximum GPU memory occupied by tensors in bytes for a given device.
+
+    By default, this returns the peak allocated memory since the beginning of
+    this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to
+    reset the starting point in tracking this metric. For example, these two
+    functions can measure the peak allocated memory usage of each iteration in a
+    training loop.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    return memory_stats(device=device).get("allocated_bytes.all.peak", 0)
+
+
+def memory_reserved(device: Union[Device, int] = None) -> int:
+    r"""Return the current GPU memory managed by the caching allocator in bytes for a given device.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    return memory_stats(device=device).get("reserved_bytes.all.current", 0)
+
+
+def max_memory_reserved(device: Union[Device, int] = None) -> int:
+    r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device.
+
+    By default, this returns the peak cached memory since the beginning of this
+    program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset
+    the starting point in tracking this metric. For example, these two functions
+    can measure the peak cached memory amount of each iteration in a training
+    loop.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    return memory_stats(device=device).get("reserved_bytes.all.peak", 0)
+
+
+@deprecated(
+    "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`",
+    category=FutureWarning,
+)
+def memory_cached(device: Union[Device, int] = None) -> int:
+    r"""Deprecated; see :func:`~torch.cuda.memory_reserved`."""
+    return memory_reserved(device=device)
+
+
+@deprecated(
+    "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`",
+    category=FutureWarning,
+)
+def max_memory_cached(device: Union[Device, int] = None) -> int:
+    r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`."""
+    return max_memory_reserved(device=device)
+
+
+def memory_snapshot():
+    r"""Return a snapshot of the CUDA memory allocator state across all devices.
+
+    Interpreting the output of this function requires familiarity with the
+    memory allocator internals.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    return torch._C._cuda_memorySnapshot()["segments"]
+
+
+def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str:
+    r"""Return a human-readable printout of the current memory allocator statistics for a given device.
+
+    This can be useful to display periodically during training, or when
+    handling out-of-memory exceptions.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            printout for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+        abbreviated (bool, optional): whether to return an abbreviated summary
+            (default: False).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
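+
+    Example (a minimal sketch; assumes a CUDA device is available):
+
+    .. code-block:: python
+
+        # Print the full table, or a shorter one with abbreviated=True.
+        print(torch.cuda.memory_summary())
+        print(torch.cuda.memory_summary(abbreviated=True))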
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
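+
+    For example, one way to log it while handling an OOM (a minimal sketch;
+    ``train_step`` and ``batch`` are placeholders for the caller's own code):
+
+    .. code-block:: python
+
+        try:
+            loss = train_step(batch)
+        except torch.cuda.OutOfMemoryError:
+            # Show which processes currently hold GPU memory, then re-raise.
+            print(torch.cuda.list_gpu_processes())
+            raise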
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
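+
+    Example (a minimal sketch; assumes a CUDA device is available):
+
+    .. code-block:: python
+
+        # Both values are reported in bytes.
+        free, total = torch.cuda.mem_get_info()
+        print(f"{free / 1024**3:.2f} GiB free of {total / 1024**3:.2f} GiB")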
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
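+
+    A minimal usage sketch (assuming these names are exported as
+    ``torch.cuda.MemPool`` and ``torch.cuda.use_mem_pool``, as listed in this
+    module's ``__all__``):
+
+    .. code-block:: python
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # Allocations made here are routed to ``pool``.
+            x = torch.empty(1024, device="cuda")
+        print(pool.use_count())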
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
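+
+    Example (a minimal sketch; assumes a CUDA-capable device and that ``MemPool``
+    and ``use_mem_pool`` are exposed under ``torch.cuda``)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # tensors created inside the block draw their memory from ``pool``
+            x = torch.randn(1024, device="cuda")
+        print(pool.use_count())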
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
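+
+
+def _usage_example() -> None:
+    # Illustrative sketch only, not part of the upstream API surface: it shows how
+    # the statistics and pooling helpers defined above are typically combined.
+    # The helper name `_usage_example` is introduced here purely for demonstration,
+    # and it assumes a CUDA-capable build with at least one visible device.
+    x = torch.empty(1024, 1024, device="cuda")
+
+    # Flattened allocator statistics for the current device.
+    stats = memory_stats()
+    print("allocated_bytes.all.current:", stats.get("allocated_bytes.all.current", 0))
+
+    # Route allocations made inside the context manager to a dedicated pool.
+    pool = MemPool()
+    with use_mem_pool(pool):
+        y = torch.empty(256, 256, device="cuda")  # noqa: F841
+    print("pool use_count:", pool.use_count())
+
+    # Human-readable summary, handy when debugging out-of-memory errors.
+    print(memory_summary(abbreviated=True))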
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
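+
+    A minimal sketch of routing an allocation into a pool with the
+    ``use_mem_pool`` context manager from this module (the tensor shape is an
+    arbitrary placeholder and an available CUDA device is assumed):
+
+    Example::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.randn(1024, device="cuda")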
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
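+
+    A minimal illustrative sketch (the size is arbitrary; the returned value
+    is an integer address meant to be handed to another framework):
+
+    Example::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)
+        >>> # ... share `ptr` with an external library ...
+        >>> torch.cuda.caching_allocator_delete(ptr)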
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
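+
+    A quick orientation sketch before the detailed key reference below (the
+    key used is one of the core statistics documented in this docstring):
+
+    Example::
+
+        >>> stats = torch.cuda.memory_stats()
+        >>> allocated = stats["allocated_bytes.all.current"]  # bytes in use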
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
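+
+    A minimal sketch of the per-iteration pattern described above
+    (``run_iteration`` stands in for arbitrary, hypothetical user code):
+
+    Example::
+
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> run_iteration()  # hypothetical workload
+        >>> peak_bytes = torch.cuda.max_memory_reserved()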
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
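+
+    A hedged sketch, assuming a custom allocator has already been compiled
+    into a shared library (``my_alloc.so`` and both function names below are
+    hypothetical):
+
+    Example::
+
+        >>> alloc = torch.cuda.memory.CUDAPluggableAllocator(
+        ...     "my_alloc.so", "my_malloc", "my_free"
+        ... )
+        >>> torch.cuda.memory.change_current_allocator(alloc)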
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
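+
+    A minimal illustrative sketch (the tensor is an arbitrary placeholder):
+
+    Example::
+
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> del x                      # block stays cached by the allocator
+        >>> torch.cuda.empty_cache()   # cached, unused blocks returned to the driver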
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
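+
+    Example (an illustrative sketch; assumes an initialized CUDA device, the
+    printed table is omitted here)::
+
+        >>> print(torch.cuda.memory_summary(device=0, abbreviated=True))  # doctest: +SKIP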
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
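+
+    Example (a small sketch; assumes at least one visible CUDA device)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes / 1024**3:.1f} GiB free of {total_bytes / 1024**3:.1f} GiB")  # doctest: +SKIP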
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment to its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot,
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception; 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
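+
+    A minimal sketch of the intended flow, assuming a user-compiled shared library
+    (the ``.so`` path and symbol names below are placeholders, not shipped files):
+
+    .. code-block:: python
+
+        # Load a pluggable allocator and install it before any CUDA memory is allocated.
+        new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+            "my_alloc.so", "my_malloc", "my_free"  # placeholder path and symbols
+        )
+        torch.cuda.memory.change_current_allocator(new_alloc)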
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
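+
+    A minimal usage sketch (assuming a CUDA build where this module is importable
+    as ``torch.cuda.memory``):
+
+    .. code-block:: python
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.memory.use_mem_pool(pool):
+            x = torch.empty(1024, device="cuda")  # served from ``pool``
+        # allocations made outside the context go back to the default pool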
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
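+
+
+# A minimal memory-debugging sketch built from the private helpers above
+# (`run_workload` is a placeholder for the code being profiled):
+#
+#     torch.cuda.memory._record_memory_history(max_entries=100_000)
+#     run_workload()
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")
+#     # Open the pickle with the viewer at pytorch.org/memory_viz.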
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the amount of memory that the caching allocator
+    can allocate on a CUDA device. The allowed value equals the total visible memory
+    multiplied by the fraction. If a process tries to allocate more than the allowed
+    value, the allocator raises an out-of-memory error.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and is visible in
+    `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
+    """
+    if is_initialized():
+        torch._C._cuda_emptyCache()
+
+
+def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]:
+    r"""Return a dictionary of CUDA memory allocator statistics for a given device.
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
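+
+    For illustration, a minimal sketch of reading one of the core statistics by
+    its flattened key (key names as listed above; the default of 0 covers the
+    case where no pinned host memory has been allocated yet)::
+
+        >>> stats = torch.cuda.host_memory_stats()
+        >>> stats.get("allocated_bytes.current", 0)  # bytes currently held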
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
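+
+    Example (an illustrative sketch, not an official recipe; ``loader`` and
+    ``train_one_iteration`` are placeholders for user code)::
+
+        >>> for batch in loader:
+        ...     torch.cuda.reset_peak_memory_stats()
+        ...     train_one_iteration(batch)
+        ...     peak = torch.cuda.max_memory_reserved()
+        ...     print(f"peak reserved this iteration: {peak} bytes")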
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
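+
+    A minimal, illustrative way to use this (assuming a CUDA-enabled build and
+    at least one visible device) is to print the summary periodically::
+
+        >>> print(torch.cuda.memory_summary(abbreviated=True))  # doctest: +SKIP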
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
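+
+    A short, illustrative check (assuming a CUDA-enabled build) of how much
+    headroom is left before attempting a large allocation::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()  # doctest: +SKIP
+        >>> print(f"{free_bytes / total_bytes:.1%} of device memory is free")  # doctest: +SKIP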
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
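+
+    Example (illustrative sketch; assumes a CUDA-enabled build with at least
+    one visible device)::
+
+        >>> pool = torch.cuda.MemPool()  # doctest: +SKIP
+        >>> with torch.cuda.memory.use_mem_pool(pool):  # doctest: +SKIP
+        ...     buf = torch.empty(1024, device="cuda")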
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator currently being used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        ctx = MemPoolContext(self)
+        try:
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
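+# A minimal usage sketch (editorial addition); creating a pool and making it
+# active explicitly via MemPoolContext. The values printed are illustrative:
+#
+#     pool = MemPool()                        # pool backed by the default allocator
+#     print(pool.id, pool.use_count())        # identify the pool / check its refcount
+#     ctx = MemPoolContext(pool)              # make `pool` the active pool
+#     assert MemPoolContext.active_pool() is not None
+#     del ctx                                 # restores the previously active pool
+
+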
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
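+
+
+# A minimal usage sketch (editorial addition), assuming a CUDA device is
+# available; the tensor size is illustrative only:
+#
+#     pool = MemPool()
+#     with use_mem_pool(pool):
+#         x = torch.empty(1 << 20, device="cuda")   # allocation routed to `pool`
+#     # outside the context, allocations go back to the default pool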
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
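
A minimal usage sketch of the query-and-reset surface defined above (memory_allocated, max_memory_allocated, memory_reserved, mem_get_info, memory_summary, reset_peak_memory_stats, empty_cache). It assumes the module is importable under torch.cuda as in upstream PyTorch rather than at the asdds.py path this patch uses, plus a CUDA-capable build; the tensor workload is only there to produce allocations.

import torch

if torch.cuda.is_available():
    x = torch.randn(1024, 1024, device="cuda")
    y = x @ x

    print(torch.cuda.memory_allocated())      # bytes currently occupied by tensors
    print(torch.cuda.max_memory_allocated())  # peak bytes since start or last reset
    print(torch.cuda.memory_reserved())       # bytes held by the caching allocator
    free_b, total_b = torch.cuda.mem_get_info()
    print(torch.cuda.memory_summary(abbreviated=True))

    torch.cuda.reset_peak_memory_stats()      # restart peak tracking from here
    del x, y
    torch.cuda.empty_cache()                  # return unused cached segments to CUDA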
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',           # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',   # the caching allocator asked cudaMalloc for more memory
+                               # and added it as a segment in its cache
+            'segment_free',    # the caching allocator called cudaFree to return memory
+                               # to CUDA, possibly to free up memory to
+                               # allocate more segments or because empty_cache() was called
+            'oom',             # the allocator threw an OOM exception. 'size' is
+                               # the requested number of bytes that did not succeed
+            'snapshot',        # the allocator generated a memory snapshot
+                               # useful to correlate a previously taken
+                               # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory CUDA still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
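+
+    Example (illustrative sketch; assumes a CUDA build of PyTorch and that the
+    class is exposed as ``torch.cuda.MemPool``)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> pool_id = pool.id        # tuple of two ints identifying the pool
+        >>> refs = pool.use_count()  # current reference count of the pool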
+
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
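+
+    Example (illustrative sketch; assumes at least one CUDA device and that this
+    module is importable as ``torch.cuda.memory``)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.memory.use_mem_pool(pool):
+        ...     buf = torch.empty(1024, device="cuda")  # allocation routed to ``pool``
+        >>> # outside the block, allocations go back to the default pool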
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
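+
+    A minimal usage sketch (assuming a CUDA device is available)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> import torch
+        >>> print(torch.cuda.memory_summary(abbreviated=True))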
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
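+
+    A minimal usage sketch (assuming a CUDA device is available; the device
+    argument is omitted, so the current device is queried)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> import torch
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> free_bytes <= total_bytes
+        True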
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int #  cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                        'active_awaiting_free', # waiting for another stream to finish using
+                                                # this, then it will become free
+                        'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested', # the allocation received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                              # able to be used in future allocation calls
+            'segment_alloc', # the caching allocator asked cudaMalloc for more memory
+                             # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',          # the allocator threw an OOM exception. 'size' is
+                            # the requested number of bytes that did not succeed
+            'snapshot'      # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
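+
+    A minimal usage sketch (assuming a CUDA device is available;
+    :func:`~torch.cuda.use_mem_pool`, defined later in this module, routes
+    allocations into the pool)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> import torch
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # served from ``pool``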
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
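+
+    A minimal usage sketch (assuming a CUDA device is available; the 1 MiB
+    size is only illustrative)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> import torch
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024 * 1024)
+        >>> torch.cuda.caching_allocator_delete(ptr)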
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
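+
+    A minimal usage sketch (assuming a CUDA device is available; the reset
+    call only narrows the measurement window to the lines that follow it)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> import torch
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> torch.cuda.max_memory_reserved() >= x.numel() * x.element_size()
+        True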
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc'  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                              # and adds it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
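+
+
+# Editor's illustrative sketch (not part of the original patch): `_demo_pool_and_snapshot`
+# is a hypothetical helper showing how the pool and snapshot APIs defined above might be
+# combined. It assumes a CUDA-enabled PyTorch build with at least one visible GPU, and
+# that this module is imported as part of a package (the relative imports at the top
+# prevent running the file directly). Tensor sizes and the snapshot filename are
+# arbitrary choices for the sketch.
+def _demo_pool_and_snapshot() -> None:
+    if not torch.cuda.is_available():
+        return
+
+    # Route allocations made inside the context manager to a private pool.
+    pool = MemPool()
+    with use_mem_pool(pool):
+        scratch = torch.empty(1 << 20, device="cuda")
+    del scratch
+
+    # Record allocation history, then dump a snapshot that can be opened in the
+    # interactive viewer at pytorch.org/memory_viz.
+    _record_memory_history("all")
+    workspace = torch.empty(4 << 20, device="cuda")
+    _dump_snapshot("dump_snapshot.pickle")
+    del workspace
+    _record_memory_history(None)
+
+    # Human-readable views of the allocator state.
+    print(memory_summary(abbreviated=True))
+    print(list_gpu_processes())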
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
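+
+    A minimal usage sketch (illustrative only; it assumes ``torch`` is imported
+    and that an initialized CUDA context with at least one visible device is
+    available)::
+
+        print(torch.cuda.memory_summary())  # full breakdown for the current device
+        print(torch.cuda.memory_summary(device=0, abbreviated=True))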
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
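+
+    A minimal usage sketch (illustrative only; it assumes a CUDA-capable device
+    is visible to the process)::
+
+        import torch
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"{free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB total")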
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free this memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
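+
+    A minimal usage sketch (illustrative only; it assumes a CUDA device is
+    available and that this class is exposed as ``torch.cuda.MemPool``, as the
+    cross-references in this module suggest)::
+
+        pool = torch.cuda.MemPool()
+        with use_mem_pool(pool):  # `use_mem_pool` is defined later in this module
+            scratch = torch.empty(1024, device="cuda")  # allocation routed to `pool`
+        print(pool.use_count())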
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
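+
+
+# Illustrative workflow sketch (not part of the API defined above): the
+# debugging helpers in this module are typically combined by recording the
+# allocation history, running a workload, and dumping a snapshot for the
+# viewer at pytorch.org/memory_viz. Kept as comments because it assumes a
+# CUDA device and user code (`run_workload`) that this module does not provide.
+#
+#     torch.cuda.memory._record_memory_history(max_entries=100_000)
+#     run_workload()  # hypothetical user code that allocates CUDA tensors
+#     torch.cuda.memory._dump_snapshot("workload_snapshot.pickle")
+#     torch.cuda.memory._record_memory_history(enabled=None)  # stop recording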
+
+ """
+ ctx = MemPoolContext(pool)
+ device_index = (
+ torch.cuda.current_device() if device is None else _get_device_index(device)
+ )
+ _cuda_beginAllocateToPool(device_index, pool.id)
+ try:
+ yield
+ finally:
+ _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+ _cuda_releasePool(device_index, pool.id)
+ del ctx
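Reviewer note: a minimal, untested usage sketch of the memory-history helpers added above. It assumes a CUDA-capable build and that the module is importable as torch.cuda.memory, as it is upstream; in this patch the same code is added as asdds.py, so the import path would differ.

import torch
from torch.cuda import memory  # this patch adds the same code as asdds.py

# Start recording alloc/free events with Python stack traces (private helper above).
memory._record_memory_history(enabled="all", stacks="python")

tensors = [torch.randn(1024, 1024, device="cuda") for _ in range(8)]
del tensors

# Dump a snapshot that the viewer at pytorch.org/memory_viz can open, then stop recording.
memory._dump_snapshot("snapshot.pickle")
memory._record_memory_history(enabled=None)

print(torch.cuda.memory_summary(abbreviated=True))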
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
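+
+    A minimal sketch of the per-iteration measurement described above
+    (``train_step`` and ``num_iterations`` are hypothetical names; a CUDA
+    device is assumed to be available)::
+
+        >>> for _ in range(num_iterations):
+        ...     torch.cuda.reset_peak_memory_stats()
+        ...     train_step()
+        ...     peak_bytes = torch.cuda.max_memory_reserved()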
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the ``.so`` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
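+
+    A minimal sketch of installing a pluggable allocator (``alloc.so``,
+    ``my_malloc`` and ``my_free`` are placeholder names)::
+
+        >>> new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+        ...     "alloc.so", "my_malloc", "my_free"
+        ... )
+        >>> torch.cuda.memory.change_current_allocator(new_alloc)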
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
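+
+    A minimal sketch (assuming a CUDA device is available)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.ones(1024, device="cuda")  # allocated from ``pool``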
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocator received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
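+
+
+# Illustrative sketch (an editorial addition, not part of the upstream API): shows how
+# the private history/snapshot helpers above are commonly combined. The helper name
+# `_example_capture_memory_snapshot` and its default filename are hypothetical.
+def _example_capture_memory_snapshot(workload, filename="memory_snapshot.pickle"):
+    """Run ``workload()`` with allocation-history recording enabled, then dump a
+    snapshot that can be inspected with the pytorch.org/memory_viz viewer."""
+    _record_memory_history("all")
+    try:
+        workload()
+        _dump_snapshot(filename)
+    finally:
+        # Disable recording again so the per-allocation bookkeeping does not persist.
+        _record_memory_history(None)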
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the ``.so`` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
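+
+    Example (an illustrative sketch; it assumes a CUDA device is available and
+    uses the module-level names defined in this file)::
+
+        >>> pool = MemPool()
+        >>> with use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # allocation routed to `pool`
+        >>> count = pool.use_count()  # reference count of the pool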
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
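+
+
+# Illustrative usage sketch (an editorial addition, not part of the original module):
+# exercises the reporting helpers defined above. It only runs when this file is
+# executed directly and a CUDA device is available.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        x = torch.empty(1024, 1024, device="cuda")  # ~4 MiB of float32 so there is something to report
+        print(memory_summary(abbreviated=True))
+        print(list_gpu_processes())
+        free_bytes, total_bytes = mem_get_info()
+        print(f"mem_get_info: {free_bytes} bytes free of {total_bytes} bytes total")
+    else:
+        print("No CUDA device available; skipping the memory reporting demo.")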
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
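+
+    Example (illustrative; assumes an initialized CUDA context on device 0)::
+
+        >>> # print a compact, human-readable table of allocator statistics
+        >>> print(torch.cuda.memory_summary(device=0, abbreviated=True))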
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
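+
+    Example (illustrative; assumes at least one visible CUDA device)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes / 1024**3:.1f} GiB free of {total_bytes / 1024**3:.1f} GiB total")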
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
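+
+    Example (illustrative sketch; assumes these names are re-exported as
+    ``torch.cuda.MemPool`` and ``torch.cuda.use_mem_pool`` and that a CUDA
+    device is available)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # this allocation is routed to ``pool``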
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
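+# Illustrative usage sketch (assumes a CUDA device; "snapshot.pickle" and the
+# workload below are arbitrary placeholders): record allocator history around
+# a workload and dump a snapshot that can be loaded at pytorch.org/memory_viz.
+#
+#     import torch
+#
+#     torch.cuda.memory._record_memory_history(max_entries=100_000)
+#     x = torch.randn(1024, 1024, device="cuda")
+#     y = x @ x
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")
+#     torch.cuda.memory._record_memory_history(enabled=None)  # stop recording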
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
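+# Illustrative usage sketch ("my_alloc.so", "my_malloc" and "my_free" are
+# hypothetical names; the .so must export functions with the signatures
+# documented above, and the swap must happen before any CUDA allocation):
+#
+#     import torch
+#
+#     new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+#         "my_alloc.so", "my_malloc", "my_free"
+#     )
+#     torch.cuda.memory.change_current_allocator(new_alloc)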
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
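+# Illustrative usage sketch (assumes a CUDA device and that `MemPool` and
+# `use_mem_pool` are re-exported under `torch.cuda`, as listed in `__all__`):
+#
+#     import torch
+#
+#     pool = torch.cuda.MemPool()
+#     with torch.cuda.use_mem_pool(pool):
+#         x = torch.randn(1024, device="cuda")  # allocation routed to `pool`
+#     print(len(pool.snapshot()))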
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
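+
+    Example (illustrative; the exact table contents depend on the workload)::
+
+        print(torch.cuda.memory_summary(abbreviated=True))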
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
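+
+    Example (illustrative; actual values depend on the device and on other
+    processes sharing it)::
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"{free_bytes / 2**30:.1f} GiB free of {total_bytes / 2**30:.1f} GiB")
+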
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
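+
+
+# A minimal usage sketch (comments only, so nothing runs at import time):
+# record allocation history, run some work, then dump a snapshot that the
+# viewer at https://pytorch.org/memory_viz can load. The tensor shape, the
+# entry cap and the output filename are arbitrary placeholders.
+#
+#     torch.cuda.memory._record_memory_history(max_entries=100000)
+#     x = torch.randn(1024, 1024, device="cuda")
+#     del x
+#     torch.cuda.memory._dump_snapshot("my_snapshot.pickle")
+#     torch.cuda.memory._record_memory_history(enabled=None)  # stop recording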
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly to free up memory for more segments
+                                 # or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
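+
+
+# Illustrative, comment-only check of the active backend. The backend is
+# chosen through the ``PYTORCH_CUDA_ALLOC_CONF`` environment variable (for
+# example ``PYTORCH_CUDA_ALLOC_CONF=backend:cudaMallocAsync``) before the
+# process starts, since the setting is read when the allocator is initialized.
+#
+#     if torch.cuda.memory.get_allocator_backend() == "cudaMallocAsync":
+#         print("using CUDA's built-in asynchronous allocator")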
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
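+
+
+# Hedged sketch of swapping in a pluggable allocator. The shared-object name
+# ("alloc.so") and the exported symbols ("my_alloc", "my_free") are
+# placeholders; they must match functions actually compiled into the user's
+# library, with the signatures documented above.
+#
+#     new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+#         "alloc.so", "my_alloc", "my_free"
+#     )
+#     torch.cuda.memory.change_current_allocator(new_alloc)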
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
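+
+
+# Minimal MemPool sketch (comments only, illustrative): allocations made
+# inside the context manager are routed to the pool. The tensor shape is an
+# arbitrary placeholder.
+#
+#     pool = torch.cuda.memory.MemPool()
+#     with torch.cuda.memory.use_mem_pool(pool):
+#         y = torch.empty(64, device="cuda")
+#     print(pool.use_count())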
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
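+
+
+# Usage sketch (illustrative only, assuming CUDA is initialized): creating a
+# pool backed by the current CUDACachingAllocator configuration and inspecting
+# it through MemPoolContext.  The function name is only an example.
+def _example_inspect_mem_pool():
+    pool = MemPool()  # no custom allocator: follow the current allocator config
+    pool_id = pool.id  # (int, int) identifier inside the caching allocator
+    refs = pool.use_count()  # reference count currently held on the pool
+    ctx = MemPoolContext(pool)  # make `pool` the active pool
+    assert MemPoolContext.active_pool() is not None
+    del ctx  # restore the previously active pool
+    return pool_id, refs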
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
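+
+
+# Usage sketch (illustrative only, assuming a CUDA device is available):
+# routing a group of allocations to a private pool with the context manager
+# above, then taking an allocator snapshot while that pool is active.
+def _example_route_allocations_to_pool():
+    pool = MemPool()
+    with use_mem_pool(pool):
+        # Tensors created inside the block draw their memory from `pool`.
+        buffers = [torch.empty(1024, device="cuda") for _ in range(4)]
+    segments = pool.snapshot()  # allocator snapshot taken with this pool active
+    del buffers
+    return segments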
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
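+
+    A minimal sketch of how this peak metric might be inspected around a single
+    allocation (assumes a CUDA device is available; values are illustrative)::
+
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> x = torch.empty(1024, 1024, device="cuda")  # reserves at least 4 MiB
+        >>> peak = torch.cuda.max_memory_reserved()     # at least as large as the reservation for ``x``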
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
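+
+    A minimal usage sketch (assumes a CUDA device is available and that a
+    ``MemPool`` constructed with the default allocator is acceptable;
+    illustrative only)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.randn(1024, device="cuda")  # allocation is routed to ``pool``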
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to CUDA, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator, use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator currently being used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
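+
+
+# Illustrative usage sketch: a minimal, hedged example (assuming a CUDA device
+# is available and the file is being run directly) of the workflow the
+# docstrings above describe -- record allocation history, route an allocation
+# through a MemPool, report the headline counters, and dump a snapshot for the
+# pytorch.org/memory_viz viewer. The output file name "example_snapshot.pickle"
+# is arbitrary.
+if __name__ == "__main__" and torch.cuda.is_available():
+    torch.cuda.init()
+    _record_memory_history("all")  # keep stack traces plus a full alloc/free history
+    pool = MemPool()  # new pool backed by the current CUDACachingAllocator settings
+    with use_mem_pool(pool):
+        scratch = torch.ones(1024, device="cuda")  # this allocation lands in `pool`
+    print(f"allocated: {memory_allocated()} B, reserved: {memory_reserved()} B")
+    _dump_snapshot("example_snapshot.pickle")  # open with the memory_viz viewer
+    _record_memory_history(None)  # stop recording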
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
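+
+    Example (illustrative sketch)::
+
+        >>> print(torch.cuda.memory_summary())                   # full table for the current device
+        >>> print(torch.cuda.memory_summary(abbreviated=True))   # omits the per-pool breakdown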
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
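+
+    Example (illustrative sketch; the reported values are device dependent)::
+
+        >>> free, total = torch.cuda.mem_get_info()
+        >>> used_fraction = 1 - free / total   # device-wide usage, including other processes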
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc'  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
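+
+    Example (illustrative sketch)::
+
+        >>> pool = MemPool()        # pool backed by the current allocator configuration
+        >>> pool.id                 # pair of ints identifying the pool inside the allocator
+        >>> pool.use_count()        # reference count held on the underlying pool object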
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
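+
+
+# Illustrative sketch (comments only): loading a custom allocator compiled to a
+# shared library and making it the active backend. "alloc.so" and the exported
+# symbol names below are placeholders.
+#
+#     new_alloc = CUDAPluggableAllocator("alloc.so", "my_malloc", "my_free")
+#     change_current_allocator(new_alloc)     # must run before the default allocator is initialized
+#     buf = torch.empty(1024, device="cuda")  # now served by my_malloc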
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
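The docstrings above describe the intended introspection workflow: record allocation history, inspect live statistics, then dump a snapshot for the viewer. As a quick orientation, here is a minimal usage sketch of those APIs; it assumes a CUDA-capable PyTorch build with at least one visible GPU, and the tensor sizes and output filename are purely illustrative.

    # Minimal sketch: exercise the memory-introspection APIs defined in this module.
    import torch

    if torch.cuda.is_available():
        # Record alloc/free events together with stack traces.
        torch.cuda.memory._record_memory_history(enabled="all")

        x = torch.randn(1024, 1024, device="cuda")  # roughly 4 MiB allocation
        y = x @ x
        del x, y

        print(torch.cuda.memory_allocated())    # bytes currently occupied by tensors
        print(torch.cuda.memory_reserved())     # bytes held by the caching allocator
        free, total = torch.cuda.mem_get_info() # device-wide view from cudaMemGetInfo
        print(torch.cuda.memory_summary(abbreviated=True))

        # Persist a snapshot that can be opened at pytorch.org/memory_viz,
        # then stop recording.
        torch.cuda.memory._dump_snapshot("snapshot.pickle")
        torch.cuda.memory._record_memory_history(enabled=None)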
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
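+
+
+# Illustrative usage sketch, not part of the upstream API: enable the history
+# recording described above, run a workload, then grab a snapshot.  The helper
+# name and the placeholder workload (a single matmul) are assumptions for
+# demonstration only.
+def _example_record_history(max_entries: int = 100_000):
+    """Illustrative sketch: record alloc/free history around a placeholder workload."""
+    _record_memory_history(enabled="all", context="all", stacks="python", max_entries=max_entries)
+    x = torch.randn(1024, 1024, device="cuda")  # placeholder workload (assumption)
+    y = x @ x
+    del x, y
+    snap = _snapshot()                 # capture the recorded history
+    _record_memory_history(enabled=None)  # stop recording
+    return snap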
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
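+
+
+# Illustrative sketch, not part of the upstream API: persist a snapshot as a
+# pickle for the viewer at pytorch.org/memory_viz and also write the SVG
+# renderings produced by _memory_viz.py.  The helper name and output file
+# names are assumptions.
+def _example_dump_snapshot(prefix: str = "cuda_mem"):
+    """Write the current snapshot as a pickle plus two SVG visualizations."""
+    snap = _snapshot()
+    with open(f"{prefix}.pickle", "wb") as f:
+        pickle.dump(snap, f)
+    _save_segment_usage(f"{prefix}_segments.svg", snapshot=snap)
+    _save_memory_usage(f"{prefix}_memory.svg", snapshot=snap)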
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
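+
+
+# Illustrative sketch, not part of the upstream API: load a custom allocator
+# from a shared object and make it the active one.  The .so path and the
+# exported symbol names ("my_malloc", "my_free") are assumptions; they must
+# match functions with the signatures documented in CUDAPluggableAllocator.
+def _example_swap_allocator(path_to_so: str = "./my_alloc.so"):
+    """Swap in a pluggable allocator before any CUDA memory has been allocated."""
+    new_alloc = CUDAPluggableAllocator(path_to_so, "my_malloc", "my_free")
+    change_current_allocator(new_alloc)  # errors if the default allocator was already used
+    return _get_current_allocator()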
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
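+
+
+# Illustrative sketch, not part of the upstream API: route a few allocations
+# into a private pool with the use_mem_pool context manager defined above.
+# The helper name and the tensor shape are placeholders.
+def _example_pool_allocation():
+    """Allocate inside a MemPool and return that pool's snapshot."""
+    pool = MemPool()  # follows the current CUDACachingAllocator configuration
+    with use_mem_pool(pool):
+        buf = torch.empty(1024, 1024, device="cuda")  # served from `pool`
+    # Allocations made outside the context go back to the default pool.
+    del buf
+    return pool.snapshot()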
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
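+
+
+# Illustrative usage sketch: a custom allocator compiled into a shared library can
+# be swapped in before the first CUDA allocation. The path and symbol names below
+# are placeholders for functions matching the signatures documented above:
+#
+#     new_alloc = CUDAPluggableAllocator("alloc.so", "my_malloc", "my_free")
+#     change_current_allocator(new_alloc)
+#     current = _get_current_allocator()  # wraps the now-active allocator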
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
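+
+
+# Illustrative usage sketch: allocations made inside `use_mem_pool` are routed to
+# the given pool on the selected device. Assuming a CUDA device is available:
+#
+#     pool = MemPool()
+#     with use_mem_pool(pool):
+#         buf = torch.empty(1024, device="cuda")  # served from `pool`
+#     print(pool.use_count())   # reference count of the pool
+#     print(pool.snapshot())    # snapshot taken with `pool` active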
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
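+
+    Example (a minimal sketch; assumes at least one CUDA device is available)::
+
+        >>> x = torch.empty(1024, 1024, device="cuda")  # hypothetical workload
+        >>> peak = torch.cuda.max_memory_reserved()  # peak bytes reserved so far
+        >>> torch.cuda.reset_peak_memory_stats()  # start a fresh measurement window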
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
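+
+        # On ROCm builds, enumerate processes through amdsmi instead of pynvml:
+        # resolve the device's processor handle, then query the processes using it.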
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistics for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory CUDA still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
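+
+    Example (illustrative sketch; assumes a CUDA device is available and that
+    ``MemPool`` and ``use_mem_pool`` are exposed on ``torch.cuda``)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(8, device="cuda")  # allocations inside the block come from ``pool``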
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
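+
+
+# Illustrative usage sketch (not part of the module API; tensor shapes and the
+# snapshot file name are arbitrary): shows how the flattened keys returned by
+# ``memory_stats`` relate to the convenience wrappers above, and how to record
+# allocation history and dump a snapshot for the pytorch.org/memory_viz viewer.
+if __name__ == "__main__" and torch.cuda.is_available():
+    x = torch.empty(1024, 1024, device="cuda")  # hypothetical workload
+    stats = torch.cuda.memory_stats()
+    # ``memory_allocated()`` simply reads the "allocated_bytes.all.current" entry.
+    print(stats["allocated_bytes.all.current"], torch.cuda.memory_allocated())
+    print(torch.cuda.memory_summary(abbreviated=True))
+    # Record alloc/free history, then dump a snapshot for the interactive viewer.
+    torch.cuda.memory._record_memory_history(max_entries=100_000)
+    y = torch.empty(256, 256, device="cuda")
+    torch.cuda.memory._dump_snapshot("example_snapshot.pickle")
+    torch.cuda.memory._record_memory_history(enabled=None)  # stop recording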
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
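For reviewers of this patch, here is a minimal usage sketch of the pool-routing API defined above (``MemPool``, ``use_mem_pool``). It is not part of the diff; it assumes a CUDA-capable build and that the module is importable at its upstream location, ``torch.cuda.memory``.

.. code-block:: python

    import torch
    from torch.cuda.memory import MemPool, use_mem_pool

    if torch.cuda.is_available():
        # A pool backed by the default CUDACachingAllocator configuration
        # (no custom _cuda_CUDAAllocator is supplied).
        pool = MemPool()

        # Allocations made inside the context on the current device are
        # routed to `pool` instead of the global pool.
        with use_mem_pool(pool):
            scratch = torch.empty(1024, 1024, device="cuda")

        # Segments backing the pool show up in its snapshot; use_count()
        # reports the pool's reference count.
        print(pool.use_count(), len(pool.snapshot()))
        del scratch

Note that ``use_mem_pool`` pairs ``_cuda_beginAllocateToPool`` with ``_cuda_endAllocateCurrentStreamToPool`` and ``_cuda_releasePool`` in a ``finally`` block, so allocations stop routing to the pool even if the body raises.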
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
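+
+    Example (an illustrative call; the printed table and its numbers depend on
+    the workload and device):
+
+    .. code-block:: python
+
+        import torch
+
+        print(torch.cuda.memory_summary(device=0, abbreviated=True))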
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
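+
+    Example (illustrative; the reported numbers depend on the device and on
+    other processes using it):
+
+    .. code-block:: python
+
+        import torch
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"{free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB total")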
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory CUDA still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
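+
+    Example (illustrative; the reported backend depends on how
+    ``PYTORCH_CUDA_ALLOC_CONF`` is set for the process):
+
+    .. code-block:: python
+
+        import torch
+
+        backend = torch.cuda.get_allocator_backend()
+        assert backend in ("native", "cudaMallocAsync")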
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
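+
+    A hypothetical usage sketch (:func:`torch.cuda.use_mem_pool`, defined later
+    in this module, routes allocations made inside the ``with`` block to the
+    pool; the tensor size is arbitrary):
+
+    .. code-block:: python
+
+        import torch
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            x = torch.empty(1024, device="cuda")  # allocation is served from ``pool``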
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocated block received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
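+
+
+# Illustrative usage sketch (the helper name `_example_mem_pool_usage` is hypothetical,
+# not part of the module above): route the allocations made inside a region of code to
+# a dedicated MemPool via the `use_mem_pool` context manager defined above. Assumes a
+# CUDA-capable device is present.
+def _example_mem_pool_usage() -> None:
+    if not torch.cuda.is_available():
+        return
+    pool = MemPool()  # pool backed by the current CUDACachingAllocator configuration
+    with use_mem_pool(pool):
+        # Tensors created inside the context draw their memory from `pool`.
+        scratch = torch.empty(4 * 1024 * 1024, device="cuda")
+        del scratch
+    # The pool object can still be inspected after the context exits.
+    print("pool id:", pool.id, "refcount:", pool.use_count())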
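+
+
+# Illustrative usage sketch (hypothetical helper): pull a few frequently used counters
+# out of the flat dictionary returned by `memory_stats` defined above, alongside the
+# device-wide numbers from `mem_get_info`. Assumes CUDA is available.
+def _example_memory_stats_readout(device: int = 0) -> None:
+    if not torch.cuda.is_available():
+        return
+    stats = memory_stats(device=device)
+    # Keys follow the "<metric>.<pool>.<type>" naming documented above.
+    print("current allocated bytes:", stats.get("allocated_bytes.all.current", 0))
+    print("peak reserved bytes:", stats.get("reserved_bytes.all.peak", 0))
+    print("cudaMalloc retries:", stats.get("num_alloc_retries", 0))
+    print("OOM errors:", stats.get("num_ooms", 0))
+    free_bytes, total_bytes = mem_get_info(device)
+    print("device free/total bytes:", free_bytes, total_bytes)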
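+
+
+# Illustrative usage sketch (hypothetical helper): measure the peak tensor memory of a
+# single step by resetting the peak counters first, as suggested in the
+# `max_memory_allocated` docstring above. `step_fn` is an assumed caller-provided callable.
+def _example_peak_per_step(step_fn) -> int:
+    if not torch.cuda.is_available():
+        return 0
+    reset_peak_memory_stats()
+    step_fn()
+    torch.cuda.synchronize()
+    return max_memory_allocated()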
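+
+
+# Illustrative usage sketch (hypothetical helper): when an allocation fails, print the
+# human-readable reports produced by `memory_summary` and `list_gpu_processes` defined
+# above before re-raising the error.
+def _example_report_on_oom(alloc_fn):
+    try:
+        return alloc_fn()
+    except torch.cuda.OutOfMemoryError:
+        print(memory_summary(abbreviated=True))
+        print(list_gpu_processes())
+        raise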
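+
+
+# Illustrative usage sketch (hypothetical helper): record allocator history around a
+# workload and dump it for the viewer at pytorch.org/memory_viz, using
+# `_record_memory_history` and `_dump_snapshot` defined above. The output filename is
+# an assumption.
+def _example_capture_memory_history(workload_fn, filename: str = "memory_trace.pickle") -> None:
+    if not torch.cuda.is_available():
+        return
+    _record_memory_history(enabled="all", context="all", stacks="all")
+    try:
+        workload_fn()
+        torch.cuda.synchronize()
+        _dump_snapshot(filename)
+    finally:
+        _record_memory_history(enabled=None)  # stop recording
+    print(f"wrote allocator history to {filename}")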
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from .
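+
+
+# --- Illustrative helper (a sketch, not part of the public API) ---
+# Shows how the statistics and snapshot utilities defined earlier in this
+# module can be combined into a one-shot memory report. The helper name and
+# the default "memory_report.pickle" filename are placeholders; dumping a
+# snapshot assumes an initialized CUDA context.
+def _example_memory_report(device=None, snapshot_file="memory_report.pickle") -> None:
+    print(f"allocated: {memory_allocated(device)} B (peak {max_memory_allocated(device)} B)")
+    print(f"reserved:  {memory_reserved(device)} B (peak {max_memory_reserved(device)} B)")
+    # memory_stats() flattens the nested statistics into dotted keys such as
+    # "allocated_bytes.all.current"; the event counters live at the top level.
+    stats = memory_stats(device=device)
+    print("cudaMalloc retries:", stats.get("num_alloc_retries", 0), "OOMs:", stats.get("num_ooms", 0))
+    # The pickled snapshot can be loaded in the viewer at pytorch.org/memory_viz.
+    _dump_snapshot(snapshot_file)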
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream, this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream.
+            If it is ``None``, the default stream for the selected device
+            is used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the amount of memory the caching allocator may
+    allocate on a CUDA device. The allowed amount equals the total visible memory
+    multiplied by the fraction. If a process tries to allocate more than the
+    allowed amount, the allocator raises an out-of-memory error.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and is visible in
+    `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
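+
+    Example (a minimal sketch; the exact figures depend on the device and on
+    the allocations made so far)::
+
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> print(torch.cuda.memory_summary())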
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
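+
+    Example (a minimal sketch; both values are reported in bytes and vary with
+    the device and driver state)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes / 1024**3:.1f} GiB free of {total_bytes / 1024**3:.1f} GiB")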
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
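+
+    Example (a minimal sketch using the default allocator; routing allocations
+    into the pool is done with :func:`~torch.cuda.use_mem_pool`)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> pool.id          # tuple of two ints identifying this pool
+        >>> pool.use_count() # reference count held on this pool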
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
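+
+    Example (a minimal sketch; the returned value is a raw device pointer that
+    must later be released with :func:`~torch.cuda.caching_allocator_delete`)::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)
+        >>> torch.cuda.caching_allocator_delete(ptr)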
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
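+
+    Example (a minimal sketch of per-iteration peak tracking as described
+    above; the matrix product only stands in for a real workload)::
+
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> a = torch.randn(4096, 4096, device="cuda")
+        >>> b = a @ a
+        >>> torch.cuda.max_memory_reserved()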
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
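+
+    A minimal sketch (assumes a CUDA device is available)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            x = torch.empty(1024, device="cuda")  # this allocation is routed to ``pool``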
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
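+
+    A minimal sketch (assumes a CUDA device is available)::
+
+        ptr = torch.cuda.caching_allocator_alloc(1024)  # raw memory address for 1 KiB on the current device/stream
+        # ... hand ``ptr`` to another framework ...
+        torch.cuda.caching_allocator_delete(ptr)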
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
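+
+    A minimal sketch (assumes CUDA has been initialized)::
+
+        stats = torch.cuda.memory_stats()
+        current_allocated = stats["allocated_bytes.all.current"]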
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
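+
+    For example, after flattening, ``host_memory_stats()["allocated_bytes.current"]``
+    reports the number of bytes currently held by the host (pinned-memory) allocator.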
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
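+
+
+# Illustrative usage sketch: the helper below shows how the statistics
+# functions defined above are typically combined to measure per-iteration
+# peak memory, as described in the max_memory_allocated() docstring. This is
+# a hedged example only; `_run_step` is a hypothetical workload and the
+# helper is not called anywhere in this module.
+def _example_measure_peak_memory(num_steps: int = 3) -> None:
+    def _run_step() -> torch.Tensor:
+        # Hypothetical workload: allocate and use some GPU tensors.
+        x = torch.randn(1024, 1024, device="cuda")
+        return (x @ x).sum()
+
+    for step in range(num_steps):
+        # Reset peak counters so the reported peak reflects only this step.
+        reset_peak_memory_stats()
+        _run_step()
+        torch.cuda.synchronize()
+        print(
+            f"step {step}: current={memory_allocated()} B, "
+            f"peak={max_memory_allocated()} B, reserved={memory_reserved()} B"
+        )
+    # Condensed, human-readable report of the same statistics.
+    print(memory_summary(abbreviated=True))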
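+
+
+# A second hedged sketch: record allocator history and dump a snapshot that
+# can be opened in the interactive viewer at pytorch.org/memory_viz, using
+# _record_memory_history() and _dump_snapshot() defined above. The output
+# file name is an arbitrary placeholder; the helper is illustrative only.
+def _example_dump_memory_snapshot(filename: str = "example_snapshot.pickle") -> None:
+    # Keep stack traces for allocations and a history of alloc/free events.
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    try:
+        tensors = [torch.empty(256, 256, device="cuda") for _ in range(8)]
+        del tensors
+        # Write the pickled snapshot for the interactive viewer.
+        _dump_snapshot(filename)
+    finally:
+        # Stop recording once the trace of interest has been captured.
+        _record_memory_history(enabled=None)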
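+
+
+# A third hedged sketch: route allocations into a MemPool with the
+# use_mem_pool() context manager defined above, using the default allocator
+# (allocator=None). Illustrative only; not called anywhere in this module.
+def _example_use_mem_pool() -> None:
+    pool = MemPool()
+    with use_mem_pool(pool):
+        # Allocations made inside the context are served from `pool`.
+        scratch = torch.ones(128, 128, device="cuda")
+    print("pool id:", pool.id, "use_count:", pool.use_count())
+    del scratch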
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and adds it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
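+
+    Example (a minimal usage sketch; assumes a CUDA build where this class and
+    :func:`use_mem_pool` are exposed as ``torch.cuda.MemPool`` and
+    ``torch.cuda.use_mem_pool``)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.randn(1024, device="cuda")  # this allocation is routed to `pool`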
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
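+
+
+# The following is a minimal, illustrative sketch of the snapshot workflow
+# documented in `_record_memory_history` and `_dump_snapshot` above. It is kept
+# as a comment (not executed on import); the file name is an arbitrary example
+# and a CUDA device is assumed to be available:
+#
+#     _record_memory_history(max_entries=100000)   # start recording alloc/free events
+#     ...                                          # run the workload to profile
+#     _dump_snapshot("memory_snapshot.pickle")     # open at pytorch.org/memory_viz
+#     _record_memory_history(enabled=None)         # stop recording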
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if
+    rounding adds too much overhead:
+
+    - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      memory requested by client code; compare this with ``allocated_bytes`` to check
+      whether allocation rounding adds too much overhead.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistics for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+
+    .. note::
+        With :ref:`backend:cudaMallocAsync`, some stats are not
+        meaningful, and are always reported as zero.
+    """
+    result = []
+
+    def _recurse_add_to_result(prefix, obj):
+        if isinstance(obj, dict):
+            if len(prefix) > 0:
+                prefix += "."
+            for k, v in obj.items():
+                _recurse_add_to_result(prefix + k, v)
+        else:
+            result.append((prefix, obj))
+
+    stats = memory_stats_as_nested_dict(device=device)
+    _recurse_add_to_result("", stats)
+    result.sort()
+
+    return collections.OrderedDict(result)
+
+
+def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]:
+    r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary."""
+    if not is_initialized():
+        return {}
+    device = _get_device_index(device, optional=True)
+    return torch._C._cuda_memoryStats(device)
+
+
+def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None:
+    r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator.
+
+    See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to
+    the `"allocated"` and `"freed"` keys in each individual stat dict, as well as
+    `"num_alloc_retries"` and `"num_ooms"`.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    device = _get_device_index(device, optional=True)
+    return torch._C._cuda_resetAccumulatedMemoryStats(device)
+
+
+def reset_peak_memory_stats(device: Union[Device, int] = None) -> None:
+    r"""Reset the "peak" stats tracked by the CUDA memory allocator.
+
+    See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the
+    `"peak"` key in each individual stat dict.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    device = _get_device_index(device, optional=True)
+    return torch._C._cuda_resetPeakMemoryStats(device)
+
+
+def host_memory_stats() -> dict[str, Any]:
+    r"""Return a dictionary of CUDA host memory allocator statistics.
+
+    The return value of this function is a dictionary of statistics, each of
+    which is a non-negative integer.
+
+    Core statistics:
+
+    - ``"allocated.{current,peak,allocated,freed}"``:
+        number of allocation requests received by the memory allocator.
+    - ``"allocated_bytes.{current,peak,allocated,freed}"``:
+        amount of allocated memory.
+    - ``"segment.{current,peak,allocated,freed}"``:
+        number of reserved segments from ``cudaMalloc()``.
+    - ``"reserved_bytes.{current,peak,allocated,freed}"``:
+        amount of reserved memory.
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
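+
+    A short, illustrative example (assumes CUDA is available; the printed
+    report is elided here)::
+
+        >>> # print an abbreviated allocator report for the current device
+        >>> print(torch.cuda.memory_summary(abbreviated=True))
+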
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
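+
+    A small, illustrative example (assumes at least one CUDA device)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> free_bytes <= total_bytes
+        True
+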
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',          # memory allocated
+            'free_requested', # the allocated block received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                              # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',   # the caching allocator called cudaFree to return memory
+                              # to cuda, possibly to free up memory to
+                              # allocate more segments or because empty_cache was called
+            'oom',            # the allocator threw an OOM exception. 'size' is
+                              # the requested number of bytes that did not succeed
+            'snapshot'        # the allocator generated a memory snapshot
+                              # useful to correlate a previously taken
+                              # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator currently being used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
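+
+    A minimal, illustrative sketch (assumes a CUDA build of PyTorch; ``MemPool``
+    and ``use_mem_pool`` are assumed to be re-exported under ``torch.cuda``, as
+    suggested by ``__all__``)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.ones(1024, device="cuda")  # allocation is routed to ``pool``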
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
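The allocator-statistics helpers documented above (memory_allocated, memory_reserved, max_memory_allocated, memory_stats, memory_summary) are typically polled around an allocation-heavy region. The snippet below is an editor's usage sketch rather than part of the patched module; it assumes a CUDA-enabled build with at least one visible device.

    import torch

    def report(tag: str) -> None:
        # Raw counters straight from the caching allocator, in bytes.
        print(
            f"[{tag}] allocated={torch.cuda.memory_allocated()} "
            f"reserved={torch.cuda.memory_reserved()} "
            f"peak={torch.cuda.max_memory_allocated()}"
        )

    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()         # start the peak counter fresh
        report("before")
        x = torch.randn(4096, 4096, device="cuda")   # roughly 64 MiB of float32
        report("after alloc")
        del x
        torch.cuda.empty_cache()                     # hand cached blocks back to the driver
        report("after empty_cache")
        # The same information rendered as the table built by memory_summary().
        print(torch.cuda.memory_summary(abbreviated=True))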
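For the memory-history tooling described above (_record_memory_history, _snapshot, _dump_snapshot), the usual pattern is a bracketed capture: enable recording, run the workload, write the pickle, then disable recording. A hedged sketch of that flow follows; the workload and the output file name are placeholders.

    import torch

    if torch.cuda.is_available():
        # Keep stack traces plus a bounded history of alloc/free events.
        torch.cuda.memory._record_memory_history(max_entries=100_000)
        try:
            # Placeholder workload: a handful of allocations to populate the trace.
            buffers = [torch.empty(1024, 1024, device="cuda") for _ in range(8)]
            del buffers
        finally:
            # Write the snapshot; it can be opened at https://pytorch.org/memory_viz
            torch.cuda.memory._dump_snapshot("cuda_mem_snapshot.pickle")
            # Stop recording so later allocations do not pay the tracing cost.
            torch.cuda.memory._record_memory_history(enabled=None)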
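MemPool and the use_mem_pool context manager defined above route every allocation made inside the `with` block to a dedicated pool. A minimal sketch, assuming a PyTorch build recent enough to re-export these names under torch.cuda:

    import torch

    if torch.cuda.is_available():
        # A pool backed by the default CUDACachingAllocator configuration.
        pool = torch.cuda.MemPool()

        with torch.cuda.use_mem_pool(pool):
            # Allocations made here are routed to `pool`.
            scratch = torch.empty(256, 256, device="cuda")

        # Allocations outside the context go back to the regular pools.
        other = torch.empty(256, 256, device="cuda")

        # Pool metadata exposed by the wrapper class.
        print("pool id:", pool.id, "use_count:", pool.use_count())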
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from .
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream; this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream. If it is
+            ``None``, the default stream for the selected device is used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
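+
+    Example (a minimal sketch; assumes a CUDA-capable device is available)::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)  # raw 1 KiB allocation
+        >>> torch.cuda.caching_allocator_delete(ptr)        # hand it back to the allocator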
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the caching allocator to allocating memory on a CUDA device.
+    The allowed value equals the total visible memory multiplied by the fraction.
+    If a process tries to allocate more than the allowed value, an out-of-memory
+    error is raised by the allocator.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and is visible in
+    `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
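+
+    Example (a minimal sketch; assumes a CUDA-capable device is available)::
+
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> del x                     # block returned to the caching allocator
+        >>> torch.cuda.empty_cache()  # cached blocks are released to the driver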
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
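+
+    For example, a minimal usage sketch (assumes at least one visible CUDA
+    device):
+
+    .. code-block:: python
+
+        # Print an abbreviated summary for the current device, for instance
+        # when handling an out-of-memory exception.
+        print(torch.cuda.memory_summary(abbreviated=True))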
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
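+
+    For example, a minimal usage sketch (assumes at least one visible CUDA
+    device):
+
+    .. code-block:: python
+
+        # cudaMemGetInfo reports device-wide free memory, not just this process's share.
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"free: {free_bytes / 2**30:.2f} GiB of {total_bytes / 2**30:.2f} GiB")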
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc'  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache() was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
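+
+    For example, a minimal usage sketch (assumes a CUDA device is available and
+    uses :func:`~torch.cuda.use_mem_pool`, defined later in this module):
+
+    .. code-block:: python
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # this allocation is routed to ``pool`` instead of the default pool
+            buf = torch.empty(1024, device="cuda")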
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
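+
+
+# The block below is an illustrative sketch, not part of the module API: it shows
+# one way to combine `_record_memory_history`, a small workload, and
+# `_dump_snapshot` to produce a snapshot file viewable at pytorch.org/memory_viz.
+# It assumes a CUDA device is available; the output filename is arbitrary.
+if __name__ == "__main__":
+    _record_memory_history(max_entries=100_000)  # start recording alloc/free events
+    tensors = [torch.randn(1024, 1024, device="cuda") for _ in range(4)]
+    del tensors  # the frees show up in the recorded history
+    _dump_snapshot("example_memory_snapshot.pickle")
+    _record_memory_history(None)  # stop recording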
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the ``.so`` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
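+
+
+# Illustrative usage sketch: not part of the original module. The shared-library
+# path and the exported symbol names below are hypothetical placeholders for
+# whatever a real pluggable-allocator build produces.
+def _example_install_pluggable_allocator(so_path: str = "./my_alloc.so") -> None:
+    """Load a custom allocator from a .so file and make it active (sketch)."""
+    custom = CUDAPluggableAllocator(so_path, "my_malloc", "my_free")
+    # change_current_allocator must run before the default allocator has been
+    # initialized (i.e. before the first CUDA allocation), otherwise it errors.
+    change_current_allocator(custom)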
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
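+
+
+# Illustrative usage sketch: not part of the original module. It shows the
+# intended flow of the pool API defined above; `_example_allocate_in_pool` is a
+# hypothetical helper name.
+def _example_allocate_in_pool() -> None:
+    """Route allocations into a private pool and inspect it afterwards (sketch)."""
+    pool = MemPool()  # pool backed by the current allocator configuration
+    with use_mem_pool(pool):
+        scratch = torch.empty(1024, device="cuda")  # served from `pool`
+    # Allocations made outside the context go back to the default pool.
+    print(pool.id, pool.use_count())
+    del scratch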
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
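+
+    A minimal per-iteration sketch of the pattern described above (``train_step``
+    and ``loader`` are illustrative placeholders, not part of this module)::
+
+        for batch in loader:
+            torch.cuda.reset_peak_memory_stats()
+            train_step(batch)
+            peak_bytes = torch.cuda.max_memory_reserved()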
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
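+
+    A minimal usage sketch (illustrative only; assumes the usual re-exports under
+    ``torch.cuda`` and an available CUDA device)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # allocations made in this block are routed to ``pool``
+            scratch = torch.empty(1024, device="cuda")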
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
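+
+
+# Illustrative debugging sketch (comments only; "snapshot.pickle" is an arbitrary
+# example filename). A typical workflow with the snapshot helpers defined above:
+#
+#   torch.cuda.memory._record_memory_history()              # start recording alloc/free traces
+#   # ... run the workload under investigation ...
+#   torch.cuda.memory._dump_snapshot("snapshot.pickle")     # open at pytorch.org/memory_viz
+#   torch.cuda.memory._record_memory_history(enabled=None)  # stop recording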
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
+
+    This can be useful to display periodically during training, or when
+    handling out-of-memory exceptions.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            printout for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+    """
+    if not torch.version.hip:
+        try:
+            import pynvml  # type: ignore[import]
+        except ModuleNotFoundError:
+            return "pynvml module not found, please install pynvml"
+        from pynvml import NVMLError_DriverNotLoaded
+
+        try:
+            pynvml.nvmlInit()
+        except NVMLError_DriverNotLoaded:
+            return "cuda driver can't be loaded, is cuda enabled?"
+
+        device = _get_nvml_device_index(device)
+        handle = pynvml.nvmlDeviceGetHandleByIndex(device)
+        procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
+    else:
+        try:
+            import amdsmi  # type: ignore[import]
+        except ModuleNotFoundError:
+            return "amdsmi module not found, please install amdsmi"
+        try:
+            amdsmi.amdsmi_init()  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi driver can't be loaded, is ROCm installed?"
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',           # memory allocated
+            'free_requested',  # the allocated block received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',   # the caching allocator asked cudaMalloc for more memory
+                               # and added it as a segment in its cache
+            'segment_free',    # the caching allocator called cudaFree to return memory
+                               # to CUDA, possibly trying to free up memory to
+                               # allocate more segments or because empty_cache() was called
+            'oom',             # the allocator threw an OOM exception.
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
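+
+    Example (a minimal sketch of routing an allocation through a pool; it
+    assumes the module is exposed under ``torch.cuda`` as in upstream
+    PyTorch, that a CUDA device is available, and that the tensor size is
+    purely illustrative)::
+
+        >>> import torch
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.randn(1024, device="cuda")  # allocation is routed to ``pool``
+        >>> del x  # the freed block stays cached inside ``pool``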
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If it is ``None``, then
+            the default stream for the selected device is used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the amount of memory the caching allocator may
+    allocate on a CUDA device. The allowed value equals the total visible memory
+    multiplied by the fraction. If a process tries to allocate more than the
+    allowed value, the allocator raises an out-of-memory error.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and becomes
+    visible in `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
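+
+    Example (a rough sketch of releasing cached blocks after tensors are
+    freed; it assumes a CUDA device is available and the tensor size is
+    illustrative only)::
+
+        >>> import torch
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> del x  # the block stays cached by the allocator rather than going back to the driver
+        >>> torch.cuda.empty_cache()  # release the cached, unoccupied blocks
+        >>> reserved = torch.cuda.memory_reserved()  # now smaller than before the call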
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
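+
+    A minimal usage sketch (illustrative only; assumes a CUDA device is
+    available)::
+
+        import torch
+
+        x = torch.randn(64, 64, device="cuda")
+        print(torch.cuda.memory_summary(abbreviated=True))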
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
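+
+    A minimal usage sketch (illustrative only; assumes a CUDA device is
+    available)::
+
+        import torch
+
+        free, total = torch.cuda.mem_get_info()
+        print(f"{free / 1024**3:.2f} GiB free of {total / 1024**3:.2f} GiB total")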
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots (an illustrative
+    end-to-end sketch also appears near the end of this module).
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
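+
+    A minimal usage sketch (illustrative only; assumes these names are
+    re-exported under ``torch.cuda`` as listed in this module's ``__all__``)::
+
+        import torch
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            x = torch.empty(1024, device="cuda")  # served from the pool
+        print(pool.use_count())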
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
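+
+
+# Illustrative end-to-end sketch (not part of the API above; assumes this module
+# is importable as ``torch.cuda.memory``, as in upstream PyTorch, and that a CUDA
+# device is present): record allocator history, run a workload, then dump a
+# snapshot for the viewer at pytorch.org/memory_viz.
+#
+#     import torch
+#
+#     torch.cuda.memory._record_memory_history(max_entries=100_000)
+#     x = torch.randn(1024, 1024, device="cuda")
+#     y = x @ x
+#     torch.cuda.memory._dump_snapshot("workload_snapshot.pickle")
+#     torch.cuda.memory._record_memory_history(enabled=None)  # stop recording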
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
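
For reference, the helpers defined in asdds.py above mirror what PyTorch exposes as torch.cuda.* / torch.cuda.memory.*. A minimal usage sketch, assuming a CUDA-capable PyTorch build and at least one visible GPU (only functions defined in the file above are used; the tensor shape and file name are illustrative):

import torch

if torch.cuda.is_available():
    # Start recording alloc/free history (see _record_memory_history above).
    torch.cuda.memory._record_memory_history(enabled="all", max_entries=100_000)

    x = torch.randn(1024, 1024, device="cuda")          # triggers allocations
    print(torch.cuda.memory_allocated())                 # bytes currently held by tensors
    print(torch.cuda.memory_reserved())                  # bytes held by the caching allocator
    print(torch.cuda.memory_summary(abbreviated=True))   # table produced by memory_summary above

    # Dump a snapshot for the viewer at pytorch.org/memory_viz, then stop recording.
    torch.cuda.memory._dump_snapshot("snapshot.pickle")
    torch.cuda.memory._record_memory_history(enabled=None)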
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
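+
+    Example (illustrative only; the exact table contents depend on the device,
+    the allocator state, and the PyTorch build)::
+
+        >>> # print the full per-pool breakdown for the current device
+        >>> print(torch.cuda.memory_summary())
+        >>> # or an abbreviated summary, e.g. inside an out-of-memory handler
+        >>> print(torch.cuda.memory_summary(abbreviated=True))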
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
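+
+    Example (illustrative only; the reported numbers depend on the GPU and on
+    whatever else is currently using it)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes / 2**30:.1f} GiB free of {total_bytes / 2**30:.1f} GiB")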
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
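+
+    Example (a minimal sketch; it assumes a CUDA device is available and uses
+    only helpers defined in this module)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.memory.use_mem_pool(pool):
+        ...     buf = torch.empty(1024, device="cuda")  # allocations here route to `pool`
+        >>> count = pool.use_count()  # reference count held on the pool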
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
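+
+
+# --- Editor's usage sketch (not part of the upstream module) -----------------
+# Routing allocations to a dedicated pool with MemPool/use_mem_pool, assuming a
+# CUDA-capable build. Tensor shapes are arbitrary.
+def _example_use_mem_pool() -> None:
+    if not torch.cuda.is_available():
+        return
+    pool = MemPool()  # pool backed by the current CUDACachingAllocator config
+    with use_mem_pool(pool):
+        # Allocations made here are routed to `pool`.
+        x = torch.empty(1024, device="cuda")
+        assert MemPoolContext.active_pool() is not None
+    # Outside the context manager, allocations go back to the default pool.
+    y = torch.empty(1024, device="cuda")
+    del x, y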
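+
+
+# --- Editor's usage sketch (not part of the upstream module) -----------------
+# Reading back allocator statistics with the public query helpers defined in
+# this module, assuming a CUDA-capable build and device index 0.
+def _example_report_memory(device: int = 0) -> None:
+    if not torch.cuda.is_available():
+        return
+    free_bytes, total_bytes = mem_get_info(device)
+    print(f"free={free_bytes} total={total_bytes}")
+    print(f"allocated={memory_allocated(device)} reserved={memory_reserved(device)}")
+    # Human-readable table of the statistics returned by memory_stats():
+    print(memory_summary(device=device, abbreviated=True))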
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
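+
+
+# ---------------------------------------------------------------------------
+# Illustrative usage sketch (not part of the module above). It assumes this
+# file is installed as ``torch.cuda.memory`` (as the upstream file it mirrors
+# is) and that a CUDA device is visible; the tensor size and the
+# "snapshot.pickle" filename are arbitrary choices for the example.
+#
+#     import torch
+#
+#     # Record allocation history so snapshots carry stack traces.
+#     torch.cuda.memory._record_memory_history(enabled="all")
+#
+#     x = torch.empty(1024, 1024, device="cuda")  # ~4 MiB of float32
+#
+#     # Flat statistics dict; keys are documented in memory_stats() above.
+#     stats = torch.cuda.memory_stats()
+#     print(stats["allocated_bytes.all.current"])
+#     print(torch.cuda.memory_summary(abbreviated=True))  # human-readable table
+#
+#     # Persist a snapshot that can be opened at pytorch.org/memory_viz.
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")
+#
+#     del x
+#     torch.cuda.empty_cache()
+# ---------------------------------------------------------------------------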
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSs
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
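+
+    A minimal usage sketch; ``alloc.so``, ``my_alloc`` and ``my_free`` below are
+    placeholders for a shared library and symbols compiled by the user::
+
+        >>> new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+        ...     "alloc.so", "my_alloc", "my_free"
+        ... )
+        >>> torch.cuda.memory.change_current_allocator(new_alloc)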
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
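+
+    A minimal usage sketch, assuming a CUDA device is available::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     # allocations made inside the block are routed to ``pool``
+        ...     x = torch.empty(1024, device="cuda")
+        >>> # allocations made outside the block use the default pool again
+        >>> y = torch.empty(1024, device="cuda")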
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If it is ``None`` then
+            the default stream for the selected device is used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the caching allocator to allocating memory on a CUDA device.
+    The allowed value equals the total visible memory multiplied by the fraction.
+    If a process tries to allocate more than the allowed value, the allocator raises an
+    out-of-memory error.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and becomes
+    visible in `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
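+
+    A minimal illustrative sketch (actual amounts depend on the GPU and workload)::
+
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> del x                     # the freed block stays cached by the allocator
+        >>> torch.cuda.empty_cache()  # cached, unoccupied blocks are released
+        >>> torch.cuda.memory_reserved()  # typically lower than before the call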
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
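+
+    Example (a sketch; the exact rows and values depend on the allocator
+    state at the time of the call)::
+
+        >>> # assumes CUDA is available and some allocations have already happened
+        >>> print(torch.cuda.memory_summary(abbreviated=True))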
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
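+
+    Example (a sketch; the numbers come from ``cudaMemGetInfo`` and vary by
+    machine and by whatever else is using the GPU)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> free_bytes <= total_bytes
+        True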
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
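+
+    Example (a sketch, assuming a CUDA device is available; allocations made
+    inside :func:`~torch.cuda.memory.use_mem_pool` are routed to this pool)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.memory.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")
+        >>> snapshot = pool.snapshot()  # allocator state taken with this pool active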
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
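+
+
+# --- Illustrative usage sketches (not part of the module API) -----------------
+# The guarded examples below are hedged sketches showing how the helpers defined
+# above are typically combined; they only run when this file is executed
+# directly on a machine with a CUDA device, and every call they make is an API
+# defined in this module or in core torch.
+if __name__ == "__main__" and torch.cuda.is_available():
+    # Basic allocator statistics: allocate a tensor, then inspect the counters
+    # described in the memory_stats()/memory_summary() docstrings.
+    x = torch.empty(1024, 1024, device="cuda")  # roughly 4 MiB of device memory
+    print("allocated bytes:", memory_allocated())
+    print("reserved bytes: ", memory_reserved())
+    stats = memory_stats()  # flat dict, e.g. "allocated_bytes.all.peak"
+    print("peak allocated: ", stats.get("allocated_bytes.all.peak", 0))
+    print(memory_summary(abbreviated=True))  # human-readable table of the same counters
+    free_bytes, total_bytes = mem_get_info()  # raw cudaMemGetInfo numbers
+    print(f"free/total: {free_bytes}/{total_bytes}")
+    print(list_gpu_processes())  # per-process view via pynvml/amdsmi
+    del x
+    empty_cache()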
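+
+# Hedged sketch of the allocation-history workflow described in the
+# _record_memory_history() / _snapshot() / _dump_snapshot() docstrings.
+# The output file name below is only an example.
+if __name__ == "__main__" and torch.cuda.is_available():
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    tensors = [torch.randn(256, 256, device="cuda") for _ in range(8)]
+    snap = _snapshot()  # dict with "segments" and "device_traces" keys
+    print("segments recorded:", len(snap["segments"]))
+    _dump_snapshot("example_snapshot.pickle")  # view at pytorch.org/memory_viz
+    _record_memory_history(enabled=None)  # stop recording
+    del tensors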
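+
+# Hedged sketch for CUDAPluggableAllocator / change_current_allocator.
+# "./my_alloc.so", "my_malloc" and "my_free" are hypothetical names for a
+# user-compiled allocator library; switching allocators must happen before any
+# CUDA allocation, so the calls are left commented out.
+# custom = CUDAPluggableAllocator("./my_alloc.so", "my_malloc", "my_free")
+# change_current_allocator(custom)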
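+
+# Hedged sketch for MemPool / use_mem_pool: route a tensor allocation into a
+# private pool, as described in the class and context-manager docstrings above.
+if __name__ == "__main__" and torch.cuda.is_available():
+    pool = MemPool()  # pool backed by the current allocator configuration
+    with use_mem_pool(pool):
+        y = torch.ones(512, 512, device="cuda")  # allocated inside `pool`
+    print("pool id:", pool.id, "use count:", pool.use_count())
+    del y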
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
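+
+    Example (an illustrative sketch rather than canonical usage; it assumes a
+    CUDA device is available and simply prints the abbreviated summary)::
+
+        if torch.cuda.is_available():
+            print(torch.cuda.memory_summary(abbreviated=True))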
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
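+
+    Example (an illustrative sketch; assumes a CUDA device is available and
+    queries the current device)::
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"free: {free_bytes / 1024**3:.2f} GiB of {total_bytes / 1024**3:.2f} GiB")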
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the ``.so`` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function that performs the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function that performs the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator currently being used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
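+
+    Example (an illustrative sketch; assumes a CUDA device and that this
+    experimental API is exposed as ``torch.cuda.MemPool`` and
+    ``torch.cuda.use_mem_pool``)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # allocations made inside this block are routed to ``pool``
+            x = torch.empty(1024, device="cuda")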
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
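+
+
+# Illustrative sketch: one way the helpers above are commonly combined to track
+# per-iteration peak memory and dump a snapshot for the memory_viz viewer.
+# Assumptions: a CUDA device is available, and ``train_step`` / ``num_steps``
+# are hypothetical, user-provided names.
+#
+#     torch.cuda.memory._record_memory_history(max_entries=100_000)
+#     for step in range(num_steps):
+#         torch.cuda.reset_peak_memory_stats()
+#         train_step()
+#         peak_mib = torch.cuda.max_memory_allocated() / 1024**2
+#         print(f"step {step}: peak allocated {peak_mib:.1f} MiB")
+#     torch.cuda.memory._dump_snapshot("train_snapshot.pickle")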
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
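For reviewers who want to exercise the module, here is a minimal sketch of the allocation-history workflow the docstrings above describe. It assumes the file is importable under the path its own cross-references use (torch.cuda.memory) rather than the placeholder name asdds.py, that a CUDA device is available, and that "snapshot.pickle" is just an example filename:

    import torch
    from torch.cuda import memory

    # Start keeping stack traces and a history of alloc/free events.
    memory._record_memory_history(max_entries=100_000)

    # Do some work that allocates and frees GPU memory.
    x = torch.randn(1024, 1024, device="cuda")
    del x
    torch.cuda.empty_cache()

    # Dump the _snapshot() dict for the viewer at pytorch.org/memory_viz,
    # then stop recording and print the allocator statistics table.
    memory._dump_snapshot("snapshot.pickle")
    memory._record_memory_history(enabled=None)
    print(torch.cuda.memory_summary())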
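Along the same lines, a sketch of the pool routing implemented by MemPool, MemPoolContext, and use_mem_pool at the end of the file; the pool object, tensor, and sizes are illustrative and the same import-path assumption applies:

    import torch
    from torch.cuda import memory

    # Allocations made inside the context manager are routed to `pool`
    # instead of the caching allocator's default pool.
    pool = memory.MemPool()
    with memory.use_mem_pool(pool):
        y = torch.empty(4 * 1024 * 1024, device="cuda")

    print(pool.use_count())        # reference count held on the pool
    segments = pool.snapshot()     # allocator snapshot taken with this pool active
    print(len(segments), "segments")
    del y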
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
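+
+    Example (a minimal sketch of the per-iteration measurement described above;
+    it assumes an initialized CUDA device)::
+
+        torch.cuda.reset_peak_memory_stats()
+        # ... run one training iteration ...
+        peak_bytes = torch.cuda.max_memory_reserved()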
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
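+
+    Example (a minimal sketch; assumes a CUDA device and the default
+    CUDACachingAllocator configuration)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            buf = torch.empty(1024, device="cuda")  # allocation is routed to ``pool``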
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
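+
+
+# A minimal sketch of how the recording and snapshot helpers above are commonly
+# combined (illustrative only; it assumes a CUDA build with an available device,
+# and "snapshot.pickle" is just an example filename):
+#
+#   torch.cuda.memory._record_memory_history(max_entries=100000)
+#   ...  # run the workload to be profiled
+#   torch.cuda.memory._dump_snapshot("snapshot.pickle")
+#   torch.cuda.memory._record_memory_history(enabled=None)
+#   # open "snapshot.pickle" with the viewer at pytorch.org/memory_viz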
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
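+
+# Illustrative usage sketch: a minimal, hedged example of how the helpers
+# defined above might be combined, recording allocation history, dumping a
+# snapshot for the pytorch.org/memory_viz viewer, and routing allocations
+# through a MemPool. The file name "example_snapshot.pickle" is an arbitrary
+# placeholder; the block is guarded so importing this module stays
+# side-effect free, and it assumes a CUDA-capable build and device at runtime.
+if __name__ == "__main__" and torch.cuda.is_available():
+    # Record stack traces plus a history of all alloc/free events.
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    x = torch.randn(1024, 1024, device="cuda")
+    del x
+    # Persist the snapshot so it can be loaded into the memory_viz viewer.
+    _dump_snapshot("example_snapshot.pickle")
+    _record_memory_history(enabled=None)  # stop recording
+
+    # Route allocations made inside the context manager to a private pool.
+    pool = MemPool()
+    with use_mem_pool(pool):
+        t = torch.empty(4 * 1024 * 1024, device="cuda")
+    print("pool id:", pool.id, "use_count:", pool.use_count())
+    del t
+    print(memory_summary(abbreviated=True))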
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream, this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream.
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
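+
+ Example (an illustrative sketch added for clarity; the exact layout depends on the device and build)::
+
+ >>> print(torch.cuda.memory_summary(abbreviated=True))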
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
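+
+ Example (an illustrative sketch added for clarity; assumes a CUDA device is available)::
+
+ >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+ >>> print(f"{free_bytes / 1024**3:.2f} GiB free of {total_bytes / 1024**3:.2f} GiB total")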
+ """
+ if device is None:
+ device = torch.cuda.current_device()
+ # optional=True allows `device = torch.device('cuda')` for which device.index is None
+ device = _get_device_index(device, optional=True)
+ return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+ enabled: bool,
+ record_context=True,
+ trace_alloc_max_entries=1,
+ trace_alloc_record_context=False,
+ device: Union[Device, int] = None,
+ record_context_cpp=False,
+ clear_history=False,
+):
+ _C._cuda_record_memory_history_legacy(
+ enabled,
+ record_context,
+ trace_alloc_max_entries,
+ trace_alloc_record_context,
+ record_context_cpp,
+ clear_history,
+ )
+
+
+def _record_memory_history(
+ enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+ """Enable recording of stack traces associated with memory
+ allocations, so you can tell what allocated any piece of memory in
+ :func:`torch.cuda.memory._snapshot()`.
+
+ In addition to keeping stack traces with each current allocation and free,
+ this will also enable recording of a history of all alloc/free events.
+
+ Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+ and the tools in `_memory_viz.py` to visualize snapshots.
+
+ The Python trace collection is fast (2us per trace), so you may consider
+ enabling this on production jobs if you anticipate ever having to debug
+ memory issues.
+
+ C++ trace collection is also fast (~50ns/frame), which for many typical programs
+ works out to ~2us per trace, but can vary depending on stack depth.
+
+ Args:
+ enabled (Literal[None, "state", "all"], optional):
+ `None`, disable recording memory history.
+ `"state"`, keep information for currently allocated memory.
+ `"all"`, additionally keep a history of all alloc/free calls.
+ Defaults to "all".
+ context (Literal[None, "state", "alloc", "all"], optional):
+ `None`, Do not record any tracebacks.
+ `"state"`, Record tracebacks for currently allocated memory.
+ `"alloc"`, additionally keep tracebacks for alloc calls.
+ `"all"`, additionally keep tracebacks for free calls.
+ Defaults to "all".
+ stacks (Literal["python", "all"], optional):
+ `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+ `"all"`, additionally include C++ frames
+ Defaults to "all".
+ max_entries (int, optional): Keep a maximum of `max_entries`
+ alloc/free events in the recorded history.
+ """
+ if isinstance(enabled, bool):
+ return _record_memory_history_legacy(enabled, *args, **kwargs)
+ else:
+ return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+ enabled: Optional[str] = "all",
+ context: Optional[str] = "all",
+ stacks: str = "all",
+ max_entries: int = sys.maxsize,
+ device: Union[Device, int] = None,
+ clear_history: bool = False,
+):
+ _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+ """Save a snapshot of CUDA memory state at the time it was called.
+
+ The state is represented as a dictionary with the following structure.
+
+ .. code-block:: python
+
+ class Snapshot(TypedDict):
+ segments : List[Segment]
+ device_traces: List[List[TraceEntry]]
+
+ class Segment(TypedDict):
+ # Segments are memory returned from a cudaMalloc call.
+ # The size of reserved memory is the sum of all Segments.
+ # Segments are cached and reused for future allocations.
+ # If the reuse is smaller than the segment, the segment
+ # is split into more than one Block.
+ # empty_cache() frees Segments that are entirely inactive.
+ address: int
+ total_size: int # cudaMalloc'd size of segment
+ stream: int
+ segment_type: Literal['small', 'large'] # 'large' (>1MB)
+ allocated_size: int # size of memory in use
+ active_size: int # size of memory in use or in active_awaiting_free state
+ blocks : List[Block]
+
+ class Block(TypedDict):
+ # A piece of memory returned from the allocator, or
+ # currently cached but inactive.
+ size: int
+ requested_size: int # size requested during malloc, may be smaller than
+ # size due to rounding
+ address: int
+ state: Literal['active_allocated', # used by a tensor
+ 'active_awaiting_free', # waiting for another stream to finish using
+ # this, then it will become free
+ 'inactive',] # free for reuse
+ frames: List[Frame] # stack trace from where the allocation occurred
+
+ class Frame(TypedDict):
+ filename: str
+ line: int
+ name: str
+
+ class TraceEntry(TypedDict):
+ # When `torch.cuda.memory._record_memory_history()` is enabled,
+ # the snapshot will contain TraceEntry objects that record each
+ # action the allocator took.
+ action: Literal[
+ 'alloc', # memory allocated
+ 'free_requested', # the allocation received a call to free memory
+ 'free_completed', # the memory that was requested to be freed is now
+ # able to be used in future allocation calls
+ 'segment_alloc', # the caching allocator asked cudaMalloc for more memory
+ # and added it as a segment in its cache
+ 'segment_free', # the caching allocator called cudaFree to return memory
+ # to cuda, possibly trying to free up memory to
+ # allocate more segments or because empty_cache() was called
+ 'oom', # the allocator threw an OOM exception. 'size' is
+ # the requested number of bytes that did not succeed
+ 'snapshot' # the allocator generated a memory snapshot
+ # useful to correlate a previously taken
+ # snapshot with this trace
+ ]
+ addr: int # not present for OOM
+ frames: List[Frame]
+ size: int
+ stream: int
+ device_free: int # only present for OOM, the amount of
+ # memory cuda still reports to be free
+
+ Returns:
+ The Snapshot dictionary object
+ """
+ return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+ """
+ Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+ This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+ Args:
+ filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+ """
+ s = _snapshot()
+ with open(filename, "wb") as f:
+ pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+ if snapshot is None:
+ snapshot = _snapshot()
+ with open(filename, "w") as f:
+ f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+ if snapshot is None:
+ snapshot = _snapshot()
+ with open(filename, "w") as f:
+ f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+ return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+ r"""Return a string describing the active allocator backend as set by
+ ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+ ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+ (CUDA's built-in asynchronous allocator).
+
+ ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+
+ """
+
+ def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+ super().__init__(allocator, True)
+
+ @property
+ def id(self) -> tuple[int, int]:
+ r"""Returns the ID of this pool as a tuple of two ints."""
+ return super().id
+
+ @property
+ def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+ r"""Returns the allocator this MemPool routes allocations to."""
+ return super().allocator
+
+ def use_count(self) -> int:
+ r"""Returns the reference count of this pool."""
+ return super().use_count()
+
+ def snapshot(self):
+ r"""Return a snapshot of the CUDA memory allocator pool state across all
+ devices.
+
+ Interpreting the output of this function requires familiarity with the
+ memory allocator internals.
+
+ .. note::
+ See :ref:`cuda-memory-management` for more details about GPU memory
+ management.
+ """
+ try:
+ ctx = MemPoolContext(self)
+ snapshot = torch.cuda.memory_snapshot()
+ finally:
+ del ctx
+ return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+ r"""A context manager that routes allocations to a given pool.
+
+ Args:
+ pool(torch.cuda.MemPool): a MemPool object to be made active so that
+ allocations route to this pool.
+ device (torch.device or int, optional): selected device. Uses MemPool on
+ the current device, given by :func:`~torch.cuda.current_device`,
+ if :attr:`device` is ``None`` (default).
+
+ """
+ ctx = MemPoolContext(pool)
+ device_index = (
+ torch.cuda.current_device() if device is None else _get_device_index(device)
+ )
+ _cuda_beginAllocateToPool(device_index, pool.id)
+ try:
+ yield
+ finally:
+ _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+ _cuda_releasePool(device_index, pool.id)
+ del ctx
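+
+
+# Illustrative usage sketch appended by the editor (not part of the upstream
+# module): it shows how the `MemPool` and `use_mem_pool` helpers defined above
+# can route allocations into a private pool, with allocator statistics read
+# back through `memory_allocated`. Assumes a CUDA-capable device is available.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        pool = MemPool()  # pool backed by the current allocator configuration
+        before = memory_allocated()
+        with use_mem_pool(pool):
+            # Tensors created inside this context draw their memory from `pool`.
+            x = torch.randn(1024, 1024, device="cuda")
+        print("allocated bytes delta:", memory_allocated() - before)
+        del x
+        empty_cache()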
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
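+
+    A minimal usage sketch (assuming CUDA is available); tensors created inside
+    the ``with`` block draw their memory from ``pool``::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            buf = torch.empty(1024, device="cuda")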
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
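+
+
+# A hedged usage sketch (not part of the original module), kept as comments so
+# importing this file has no side effects. It combines the history and snapshot
+# helpers defined above for debugging allocator behaviour:
+#
+#     torch.cuda.memory._record_memory_history(max_entries=100000)
+#     ...  # run the workload under study
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")
+#     torch.cuda.memory._record_memory_history(enabled=None)  # stop recording
+#
+# The resulting pickle can be opened in the viewer at pytorch.org/memory_viz.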
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int #  cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                           'active_awaiting_free', # waiting for another stream to finish using
+                                                   # this, then it will become free
+                           'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a request to free this memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
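For reviewers, a short usage sketch of the statistics helpers defined above may be easier to follow than the docstrings alone. This is illustrative only and not part of the patch; it assumes a CUDA-capable build and uses only the public query functions documented in this file (memory_allocated, memory_reserved, memory_stats, memory_summary, reset_peak_memory_stats).

import torch

def report_cuda_memory(device: int = 0) -> None:
    # Guard so the sketch stays runnable on CPU-only builds.
    if not torch.cuda.is_available():
        print("CUDA not available; nothing to report")
        return
    x = torch.empty(1024, 1024, device=device)  # roughly 4 MiB of float32
    print("allocated bytes:", torch.cuda.memory_allocated(device))    # held by tensors
    print("reserved bytes: ", torch.cuda.memory_reserved(device))     # cached by the allocator
    print("peak allocated: ", torch.cuda.max_memory_allocated(device))
    # The flat stats dict uses dotted keys such as "allocated_bytes.all.current".
    stats = torch.cuda.memory_stats(device)
    print("requested bytes:", stats.get("requested_bytes.all.current", 0))
    # Human-readable table of the same counters.
    print(torch.cuda.memory_summary(device, abbreviated=True))
    del x
    torch.cuda.reset_peak_memory_stats(device)

if __name__ == "__main__":
    report_cuda_memory()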
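A second sketch shows how the memory-history helpers described in the docstrings above are typically combined. These helpers are private (underscore-prefixed) and the exact keyword values are taken from the docstring, so treat this as an assumption-laden example rather than part of the patch.

import torch

def capture_memory_trace(out_file: str = "dump_snapshot.pickle") -> None:
    if not torch.cuda.is_available():
        return
    # Record alloc/free events together with Python stack traces.
    torch.cuda.memory._record_memory_history(
        enabled="all", context="all", stacks="python", max_entries=100_000
    )
    try:
        # Workload whose allocations we want to attribute.
        for _ in range(3):
            a = torch.randn(2048, 2048, device="cuda")
            b = a @ a
            del a, b
    finally:
        # Persist segments + device traces for the pytorch.org/memory_viz viewer,
        # then stop recording.
        torch.cuda.memory._dump_snapshot(out_file)
        torch.cuda.memory._record_memory_history(enabled=None)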
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
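The private snapshot helpers above are typically used together: turn recording on, run the workload, then dump a pickle for the pytorch.org/memory_viz viewer. A minimal sketch, not part of the patch itself; the filename and tensor sizes are arbitrary examples:

import torch
from torch.cuda.memory import _record_memory_history, _dump_snapshot

if torch.cuda.is_available():
    # Start tracing alloc/free events with Python stack traces.
    _record_memory_history(enabled="all", stacks="python", max_entries=100_000)
    x = torch.randn(1024, 1024, device="cuda")
    y = x @ x
    # Write a snapshot that the memory_viz viewer can load.
    _dump_snapshot("memory_trace.pickle")
    # Stop recording.
    _record_memory_history(enabled=None)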
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
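For the pluggable-allocator path documented above, a hypothetical sketch: the shared-object path and the symbol names my_malloc/my_free are placeholders for a library that exports the C signatures listed in the docstring, and the swap has to happen before any CUDA memory is allocated.

import torch

# Load a custom allocator from a shared object and make it the active one.
new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
    "./my_allocator.so", "my_malloc", "my_free"
)
torch.cuda.memory.change_current_allocator(new_alloc)

# Subsequent CUDA allocations go through my_malloc / my_free.
t = torch.zeros(1024, device="cuda")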
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns
+            statistics for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+
+    .. note::
+        With :ref:`backend:cudaMallocAsync`, some stats are not
+        meaningful, and are always reported as zero.
+    """
+    result = []
+
+    def _recurse_add_to_result(prefix, obj):
+        if isinstance(obj, dict):
+            if len(prefix) > 0:
+                prefix += "."
+            for k, v in obj.items():
+                _recurse_add_to_result(prefix + k, v)
+        else:
+            result.append((prefix, obj))
+
+    stats = memory_stats_as_nested_dict(device=device)
+    _recurse_add_to_result("", stats)
+    result.sort()
+
+    return collections.OrderedDict(result)
+
+
+def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]:
+    r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary."""
+    if not is_initialized():
+        return {}
+    device = _get_device_index(device, optional=True)
+    return torch._C._cuda_memoryStats(device)
+
+
+def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None:
+    r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator.
+
+    See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to
+    the `"allocated"` and `"freed"` keys in each individual stat dict, as well as
+    `"num_alloc_retries"` and `"num_ooms"`.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    device = _get_device_index(device, optional=True)
+    return torch._C._cuda_resetAccumulatedMemoryStats(device)
+
+
+def reset_peak_memory_stats(device: Union[Device, int] = None) -> None:
+    r"""Reset the "peak" stats tracked by the CUDA memory allocator.
+
+    See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the
+    `"peak"` key in each individual stat dict.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    device = _get_device_index(device, optional=True)
+    return torch._C._cuda_resetPeakMemoryStats(device)
+
+
+def host_memory_stats() -> dict[str, Any]:
+    r"""Return a dictionary of CUDA host memory allocator statistics.
+
+    The return value of this function is a dictionary of statistics, each of
+    which is a non-negative integer.
+
+    Core statistics:
+
+    - ``"allocated.{current,peak,allocated,freed}"``:
+      number of allocation requests received by the memory allocator.
+    - ``"allocated_bytes.{current,peak,allocated,freed}"``:
+      amount of allocated memory.
+    - ``"segment.{current,peak,allocated,freed}"``:
+      number of reserved segments from ``cudaHostAlloc()``.
+    - ``"reserved_bytes.{current,peak,allocated,freed}"``:
+      amount of reserved memory.
+
+    For these core statistics, values are broken down as follows.
+
+    Metric type:
+
+    - ``current``: current value of this metric.
+    - ``peak``: maximum value of this metric.
+    - ``allocated``: historical total increase in this metric.
+    - ``freed``: historical total decrease in this metric.
+
+    In addition to the core statistics, we also provide some simple event
+    counters:
+
+    - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both
+      cudaHostAlloc and cudaHostRegister.
+    - ``"num_host_free"``: number of CUDA free calls. This includes both cudaFreeHost
+      and cudaHostUnregister.
+
+    Finally, we also provide some simple timing counters:
+
+    - ``"host_alloc_time.{total,max,min,count,avg}"``:
+      timing of allocation requests going through CUDA calls.
+    - ``"host_free_time.{total,max,min,count,avg}"``:
+      timing of free requests going through CUDA calls.
+
+    For these timing statistics, values are broken down as follows.
+
+    Metric type:
+
+    - ``total``: total time spent.
+    - ``max``: maximum value per call.
+    - ``min``: minimum value per call.
+    - ``count``: number of times it was called.
+    - ``avg``: average time per call.
+    """
+    result = []
+
+    def _recurse_add_to_result(prefix, obj):
+        if isinstance(obj, dict):
+            if len(prefix) > 0:
+                prefix += "."
+            for k, v in obj.items():
+                _recurse_add_to_result(prefix + k, v)
+        else:
+            result.append((prefix, obj))
+
+    stats = host_memory_stats_as_nested_dict()
+    _recurse_add_to_result("", stats)
+    result.sort()
+
+    return collections.OrderedDict(result)
+
+
+def host_memory_stats_as_nested_dict() -> dict[str, Any]:
+    r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary."""
+    if not is_initialized():
+        return {}
+    return torch._C._cuda_hostMemoryStats()
+
+
+def reset_accumulated_host_memory_stats() -> None:
+    r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator.
+
+    See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to
+    the `"allocated"` and `"freed"` keys in each individual stat dict.
+    """
+    return torch._C._cuda_resetAccumulatedHostMemoryStats()
+
+
+def reset_peak_host_memory_stats() -> None:
+    r"""Reset the "peak" stats tracked by the host memory allocator.
+
+    See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the
+    `"peak"` key in each individual stat dict.
+    """
+    return torch._C._cuda_resetPeakHostMemoryStats()
+
+
+def reset_max_memory_allocated(device: Union[Device, int] = None) -> None:
+    r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device.
+
+    See :func:`~torch.cuda.max_memory_allocated` for details.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. warning::
+        This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets
+        /all/ peak memory stats.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    warnings.warn(
+        "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, "
+        "which resets /all/ peak memory stats.",
+        FutureWarning,
+    )
+    return reset_peak_memory_stats(device=device)
+
+
+def reset_max_memory_cached(device: Union[Device, int] = None) -> None:
+    r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device.
+
+    See :func:`~torch.cuda.max_memory_cached` for details.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory CUDA still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocated block received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
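+
+    Example (a rough sketch, assuming a CUDA build of PyTorch; it relies on
+    the :func:`use_mem_pool` helper defined later in this module)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with use_mem_pool(pool):
+        ...     buf = torch.empty(1024, device="cuda")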
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
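+
+
+# A hedged, illustrative sketch (not part of the public API): on an
+# out-of-memory failure, the helpers defined above can be combined to inspect
+# the allocator state.  Assumes a CUDA device is present.
+if __name__ == "__main__":
+    try:
+        _too_big = torch.empty(1 << 40, device="cuda")  # deliberately oversized
+    except torch.cuda.OutOfMemoryError:
+        # Summarize allocator statistics, list processes holding GPU memory,
+        # and report the device's free/total memory.
+        print(memory_summary(abbreviated=True))
+        print(list_gpu_processes())
+        print(mem_get_info())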
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream, this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream. If it is ``None`` then
+            the default stream for the selected device is used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the memory that the caching allocator may allocate
+    on a CUDA device. The allowed value equals the total visible memory multiplied by
+    the fraction. If a process tries to allocate more than the allowed value, an out of
+    memory error is raised by the allocator.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and is visible in
+    `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
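+
+    Example (a minimal sketch, assuming a CUDA device and the default
+    ``native`` allocator backend)::
+
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> del x                     # the block returns to the allocator's cache
+        >>> torch.cuda.empty_cache()  # cached blocks are handed back to CUDA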
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
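+
+    Example (illustrative; the exact table contents depend on the device and on
+    what has been allocated so far)::
+
+        >>> import torch
+        >>> _ = torch.empty(4, 1024, 1024, device="cuda")
+        >>> report = torch.cuda.memory_summary(abbreviated=True)
+        >>> "Allocated memory" in report
+        True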
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
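+
+    Example (an illustrative sketch; the numbers are device-wide and depend on
+    other processes sharing the GPU)::
+
+        >>> import torch
+        >>> free, total = torch.cuda.mem_get_info()
+        >>> used_fraction = 1 - free / total
+        >>> 0.0 <= used_fraction <= 1.0
+        True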
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
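+
+    Example (illustrative sketch; assumes a CUDA-capable device)::
+
+        >>> pool = torch.cuda.MemPool()     # follows the current allocator configuration
+        >>> pool_id = pool.id               # a tuple of two ints identifying the pool
+        >>> refs = pool.use_count()         # reference count held on the pool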
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
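+
+
+# Illustrative sketch: a hypothetical helper (not called anywhere in this module)
+# showing how ``MemPool`` and ``use_mem_pool`` compose. It assumes a CUDA-capable
+# device; after the context exits, the pool keeps the cached segments that served
+# the scratch allocation.
+def _use_mem_pool_example(num_elements: int = 1 << 20):
+    """Allocate a scratch tensor from a private pool and return the pool's snapshot."""
+    pool = MemPool()  # backed by the current allocator configuration
+    with use_mem_pool(pool):
+        scratch = torch.empty(num_elements, device="cuda")  # served from `pool`
+    del scratch  # the block is released back into the pool's cache
+    return pool.snapshot()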
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
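# A hedged sketch of the snapshot workflow documented above: turn on allocation
# history, run a workload, then dump a pickle that the viewer at
# pytorch.org/memory_viz can open. The private helpers are reached through
# torch.cuda.memory as in upstream PyTorch; the file name, matrix size, and
# max_entries value are illustrative choices, not values from this patch.
import torch

if torch.cuda.is_available():
    torch.cuda.memory._record_memory_history(max_entries=100_000)
    a = torch.randn(2048, 2048, device="cuda")
    b = a @ a                                              # the workload being profiled
    torch.cuda.memory._dump_snapshot("workload_snapshot.pickle")
    torch.cuda.memory._record_memory_history(enabled=None)  # stop recording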
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
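# A minimal sketch of routing allocations through a private pool with the
# use_mem_pool() context manager documented here. MemPool and use_mem_pool come
# from this module (importable as torch.cuda.memory in upstream PyTorch); the
# tensor shape is arbitrary and a CUDA device is assumed.
import torch
from torch.cuda.memory import MemPool, use_mem_pool

if torch.cuda.is_available():
    pool = MemPool()                     # allocator=None: follow the current allocator config
    with use_mem_pool(pool):
        scratch = torch.empty(1024, device="cuda")   # this allocation is served from `pool`
        del scratch
    print(pool.use_count())              # reference count still held on the pool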
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then
+ the default stream for the selected device is used.
+
+ .. note::
+ See :ref:`cuda-memory-management` for more details about GPU memory
+ management.
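# A hedged sketch of the interoperability path described above: grab raw device
# memory from the caching allocator, hand the integer address to another
# framework, then release it with caching_allocator_delete(). The 1 MiB size is
# an arbitrary example; a CUDA device is assumed.
import torch

if torch.cuda.is_available():
    ptr = torch.cuda.caching_allocator_alloc(1024 * 1024)  # returns a raw address as int
    # ... pass `ptr` to an external library that expects a CUDA device pointer ...
    torch.cuda.caching_allocator_delete(ptr)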
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
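+
+    A minimal usage sketch (``model`` and ``batch`` are placeholders, not part
+    of this module)::
+
+        >>> out = model(batch)  # hypothetical workload
+        >>> print(torch.cuda.memory_summary(abbreviated=True))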
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
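+
+    For example, the output can be logged when catching an out-of-memory error
+    (a sketch; ``loss`` comes from a hypothetical training step)::
+
+        >>> try:
+        ...     loss.backward()
+        ... except torch.cuda.OutOfMemoryError:
+        ...     print(torch.cuda.list_gpu_processes())
+        ...     raise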
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
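+
+    A small sketch of reading the returned values (purely illustrative)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB")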
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
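+            # (roughly the "reserved_bytes.all.current" value that
+            # torch.cuda.memory_stats() reports for the same device)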
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocated block received a call to free its memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
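+
+    A rough usage sketch (illustrative only; shapes are arbitrary)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.randn(1024, device="cuda")  # served from ``pool``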
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
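+
+
+# Illustrative sketch only (not part of the API defined above): print a coarse
+# memory report for the current device using functions from this module.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        _free_b, _total_b = mem_get_info()
+        print(f"free: {_free_b / 2**30:.2f} GiB of {_total_b / 2**30:.2f} GiB total")
+        print(memory_summary(abbreviated=True))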
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
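+
+    A minimal usage sketch (assuming a PyTorch build that exposes ``torch.cuda.MemPool``
+    and a CUDA-capable device; the tensor size is arbitrary)::
+
+        import torch
+
+        if torch.cuda.is_available():
+            pool = torch.cuda.MemPool()
+            with torch.cuda.use_mem_pool(pool):
+                # allocations made inside this block are routed to ``pool``
+                x = torch.empty(1024, device="cuda")
+            print(pool.use_count())  # reference count held on the pool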
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
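+
+    A minimal usage sketch (assuming a CUDA-capable device is available; the
+    tensor size is arbitrary)::
+
+        import torch
+
+        if torch.cuda.is_available():
+            x = torch.empty(1024, 1024, device="cuda")
+            del x                     # the block returns to the caching allocator, not to the driver
+            torch.cuda.empty_cache()  # hand unused cached blocks back so other applications can use them
+            print(torch.cuda.memory_reserved())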
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
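+
+    A minimal usage sketch (assuming a CUDA-capable device is available; the
+    tensor size is arbitrary)::
+
+        import torch
+
+        if torch.cuda.is_available():
+            x = torch.randn(1024, 1024, device="cuda")
+            # abbreviated=True collapses the per-pool breakdown into a single row per metric
+            print(torch.cuda.memory_summary(abbreviated=True))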
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
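+
+    A minimal usage sketch (assuming a CUDA-capable device is available)::
+
+        import torch
+
+        if torch.cuda.is_available():
+            free_bytes, total_bytes = torch.cuda.mem_get_info()
+            print(f"free: {free_bytes / 2**30:.2f} GiB of {total_bytes / 2**30:.2f} GiB")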
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
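+
+    A minimal usage sketch (assuming a PyTorch build that exposes ``torch.cuda.MemPool``
+    and a CUDA-capable device)::
+
+        import torch
+
+        if torch.cuda.is_available():
+            pool = torch.cuda.MemPool()  # backed by the current allocator configuration
+            print(pool.id)               # (int, int) identifier inside the CUDACachingAllocator
+            print(pool.use_count())      # reference count held on the pool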
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
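+
+    A minimal usage sketch (assuming a CUDA-capable device is available; the
+    allocation size is arbitrary)::
+
+        import torch
+
+        if torch.cuda.is_available():
+            ptr = torch.cuda.caching_allocator_alloc(1024)  # raw 1 KiB block on the current device and stream
+            torch.cuda.caching_allocator_delete(ptr)        # release it through the same allocator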
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
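+
+    Example (a minimal sketch of the per-iteration measurement described above;
+    ``data_loader`` and ``train_step`` are hypothetical user-defined objects and
+    not part of this module)::
+
+        >>> # xdoctest: +SKIP("requires a CUDA device")
+        >>> for batch in data_loader:
+        ...     torch.cuda.reset_peak_memory_stats()
+        ...     train_step(batch)
+        ...     peak_bytes = torch.cuda.max_memory_reserved()
+        ...     print(f"peak reserved this iteration: {peak_bytes} bytes")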
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
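+
+    Example (an illustrative sketch, not a prescribed workflow; assumes a
+    CUDA-capable build where :class:`~torch.cuda.MemPool` is available)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # this allocation is routed to ``pool``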
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
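+
+
+# Illustrative usage sketch (kept as comments so the module's behavior is
+# unchanged; assumes a CUDA-capable build). It records allocation history with
+# the helpers defined above and dumps a snapshot that the viewer at
+# https://pytorch.org/memory_viz can open:
+#
+#     torch.cuda.memory._record_memory_history(max_entries=100000)
+#     ...  # run the workload to be profiled (user code)
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")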
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc'  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
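+
+
+# Illustrative sketch: route the allocations of one workload into a private MemPool
+# via `use_mem_pool`, then inspect the pool. Assumes a CUDA-capable device is
+# available; the helper below exists only for demonstration.
+def _example_route_to_private_pool() -> None:
+    pool = MemPool()  # pool backed by the current CUDACachingAllocator configuration
+    with use_mem_pool(pool):
+        scratch = torch.empty(1024, 1024, device="cuda")  # served from `pool`
+    # Allocations made outside the context manager go to the default pool again.
+    print("pool refcount:", pool.use_count())
+    print("segments in snapshot:", len(pool.snapshot()))
+    del scratch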
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream, this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream. 
If it is ``None`` then
+            the default stream for the selected device is used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the caching allocator's memory usage on a CUDA device.
+    The allowed amount equals the total visible memory multiplied by the fraction.
+    If a process tries to allocate more than the allowed amount, the allocator will
+    raise an out of memory error.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and is visible in
+    `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management. 
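+
+    Example (illustrative; assumes a CUDA device and no other live allocations)::
+
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> del x                          # the block returns to the cache, not to the driver
+        >>> torch.cuda.memory_reserved() > 0
+        True
+        >>> torch.cuda.empty_cache()       # cached segments are handed back to CUDA
+        >>> torch.cuda.memory_reserved()
+        0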
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
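+
+    Example (illustrative sketch; assumes a CUDA-capable device is available)::
+
+        import torch
+
+        x = torch.randn(4096, 4096, device="cuda")
+        # Full per-pool report for the current device.
+        print(torch.cuda.memory_summary())
+        # Shorter report ("all" rows only) for an explicit device index.
+        print(torch.cuda.memory_summary(device=0, abbreviated=True))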
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
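+
+    Example (illustrative sketch; assumes a CUDA-capable device is available)::
+
+        import torch
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        # Values come from cudaMemGetInfo, so they reflect every process using the
+        # device, not just the current one.
+        print(f"free: {free_bytes / 1024**3:.2f} GiB of {total_bytes / 1024**3:.2f} GiB")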
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
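+
+    Example (illustrative sketch; assumes a CUDA-capable device and routes
+    allocations through the :func:`~torch.cuda.use_mem_pool` context manager
+    defined in this module)::
+
+        import torch
+
+        pool = torch.cuda.MemPool()               # pool backed by the current allocator config
+        with torch.cuda.use_mem_pool(pool):
+            x = torch.empty(1024, device="cuda")  # this allocation is routed to `pool`
+        print(pool.use_count())                   # reference count held on the pool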
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
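+
+    Example (illustrative sketch; assumes a CUDA-capable device is available)::
+
+        import torch
+
+        # Allocate 1 MiB through the caching allocator on the current device and stream.
+        ptr = torch.cuda.caching_allocator_alloc(1024 * 1024)
+        # ... hand the raw device pointer `ptr` to another framework here ...
+        torch.cuda.caching_allocator_delete(ptr)  # release it back to the allocator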
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
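+
+    Example (illustrative sketch of the per-iteration pattern described above;
+    assumes a CUDA-capable device and a hypothetical ``train_step`` function)::
+
+        import torch
+
+        for step in range(10):
+            torch.cuda.reset_peak_memory_stats()  # restart peak tracking for this iteration
+            train_step()                          # hypothetical one-iteration training step
+            peak = torch.cuda.max_memory_reserved()
+            print(f"step {step}: peak reserved {peak / 2**20:.1f} MiB")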
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
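+
+
+# --- Editor's illustrative sketch (not part of the original module) ----------
+# The statistics helpers above (memory_allocated, memory_reserved,
+# memory_summary, list_gpu_processes, mem_get_info) are typically combined
+# into a small reporting routine that is called periodically during training.
+# This is a minimal hedged example: the name `_demo_memory_report` and the
+# "less than 10% free" threshold are illustrative choices, not part of the
+# original API.
+def _demo_memory_report(device: Union[Device, int] = None) -> None:
+    if not torch.cuda.is_available():
+        print("CUDA is not available; nothing to report")
+        return
+    free_bytes, total_bytes = mem_get_info(device)
+    print(f"allocated: {memory_allocated(device) / 2**20:.1f} MiB")
+    print(f"reserved:  {memory_reserved(device) / 2**20:.1f} MiB")
+    print(f"free/total: {free_bytes / 2**20:.1f} / {total_bytes / 2**20:.1f} MiB")
+    # When the device is nearly full, dump the detailed table and the
+    # per-process view to help diagnose who is holding the memory.
+    if free_bytes < 0.1 * total_bytes:
+        print(memory_summary(device=device, abbreviated=True))
+        print(list_gpu_processes(device))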
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
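+
+
+# --- Editor's illustrative sketch (not part of the original module) ----------
+# Hedged example tying together _record_memory_history, _snapshot and
+# _dump_snapshot above: enable recording, run a workload, then persist a
+# snapshot that the pytorch.org/memory_viz viewer can open. The file name and
+# the entry limit are arbitrary illustrative choices.
+def _demo_capture_memory_history(filename: str = "memory_history.pickle") -> None:
+    if not torch.cuda.is_available():
+        return
+    _record_memory_history(enabled="all", context="all", stacks="python", max_entries=100_000)
+    try:
+        # Stand-in workload; replace with the real training or inference step.
+        x = torch.randn(1024, 1024, device="cuda")
+        y = x @ x
+        del x, y
+    finally:
+        _dump_snapshot(filename)  # pickled Snapshot dict (segments + device_traces)
+        _record_memory_history(enabled=None)  # stop recording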
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
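+
+
+# --- Editor's illustrative sketch (not part of the original module) ----------
+# Hedged usage sketch for CUDAPluggableAllocator / change_current_allocator
+# above. The library path "./my_alloc.so" and the exported symbols
+# "my_malloc" / "my_free" are hypothetical placeholders: they must name
+# functions you compiled yourself with the signatures documented in
+# CUDAPluggableAllocator.__init__.
+def _demo_pluggable_allocator() -> None:
+    new_alloc = CUDAPluggableAllocator("./my_alloc.so", "my_malloc", "my_free")
+    # Must run before the default allocator is initialized/used, otherwise
+    # change_current_allocator raises an error.
+    change_current_allocator(new_alloc)
+    # Subsequent CUDA allocations are served by my_malloc/my_free.
+    t = torch.zeros(16, device="cuda")
+    del t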
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
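+
+
+# --- Editor's illustrative sketch (not part of the original module) ----------
+# Hedged example of the MemPool / use_mem_pool API defined above: allocations
+# made inside the context are routed to the given pool, which can then be
+# inspected via MemPool.snapshot(). The tensor shape is arbitrary.
+def _demo_mem_pool() -> None:
+    if not torch.cuda.is_available():
+        return
+    pool = MemPool()  # backed by the current CUDACachingAllocator configuration
+    with use_mem_pool(pool):
+        scratch = torch.empty(1024, 1024, device="cuda")
+    print(f"pool id: {pool.id}, use count: {pool.use_count()}")
+    print(f"segments in pool snapshot: {len(pool.snapshot())}")
+    del scratch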
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+
+    See :func:`~torch.cuda.max_memory_cached` for details.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. warning::
+        This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets
+        /all/ peak memory stats.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    warnings.warn(
+        "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, "
+        "which resets /all/ peak memory stats.",
+        FutureWarning,
+    )
+    return reset_peak_memory_stats(device=device)
+
+
+def memory_allocated(device: Union[Device, int] = None) -> int:
+    r"""Return the current GPU memory occupied by tensors in bytes for a given device.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        This is likely less than the amount shown in `nvidia-smi` since some
+        unused memory can be held by the caching allocator and some context
+        needs to be created on GPU. See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    return memory_stats(device=device).get("allocated_bytes.all.current", 0)
+
+
+def max_memory_allocated(device: Union[Device, int] = None) -> int:
+    r"""Return the maximum GPU memory occupied by tensors in bytes for a given device.
+
+    By default, this returns the peak allocated memory since the beginning of
+    this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to
+    reset the starting point in tracking this metric. For example, these two
+    functions can measure the peak allocated memory usage of each iteration in a
+    training loop.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    return memory_stats(device=device).get("allocated_bytes.all.peak", 0)
+
+
+def memory_reserved(device: Union[Device, int] = None) -> int:
+    r"""Return the current GPU memory managed by the caching allocator in bytes for a given device.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    return memory_stats(device=device).get("reserved_bytes.all.current", 0)
+
+
+def max_memory_reserved(device: Union[Device, int] = None) -> int:
+    r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device.
+
+    By default, this returns the peak cached memory since the beginning of this
+    program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset
+    the starting point in tracking this metric. For example, these two functions
+    can measure the peak cached memory amount of each iteration in a training
+    loop.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    return memory_stats(device=device).get("reserved_bytes.all.peak", 0)
+
+
+@deprecated(
+    "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`",
+    category=FutureWarning,
+)
+def memory_cached(device: Union[Device, int] = None) -> int:
+    r"""Deprecated; see :func:`~torch.cuda.memory_reserved`."""
+    return memory_reserved(device=device)
+
+
+@deprecated(
+    "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`",
+    category=FutureWarning,
+)
+def max_memory_cached(device: Union[Device, int] = None) -> int:
+    r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`."""
+    return max_memory_reserved(device=device)
+
+
+def memory_snapshot():
+    r"""Return a snapshot of the CUDA memory allocator state across all devices.
+
+    Interpreting the output of this function requires familiarity with the
+    memory allocator internals.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    return torch._C._cuda_memorySnapshot()["segments"]
+
+
+def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str:
+    r"""Return a human-readable printout of the current memory allocator statistics for a given device.
+
+    This can be useful to display periodically during training, or when
+    handling out-of-memory exceptions.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            printout for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+        abbreviated (bool, optional): whether to return an abbreviated summary
+            (default: False).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
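+
+    Example (illustrative only; assumes an initialized CUDA device, and
+    ``train_step`` is a hypothetical stand-in for user code)::
+
+        >>> try:  # doctest: +SKIP
+        ...     train_step()
+        ... except torch.cuda.OutOfMemoryError:
+        ...     print(torch.cuda.memory_summary(abbreviated=True))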
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
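+
+    Example (illustrative only; assumes a visible CUDA device) of turning the
+    returned ``(free, total)`` pair into a utilization figure::
+
+        >>> free, total = torch.cuda.mem_get_info()  # doctest: +SKIP
+        >>> used_fraction = 1.0 - free / total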
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                           'active_awaiting_free', # waiting for another stream to finish using
+                                                   # this, then it will become free
+                           'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested', # the allocation received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                              # able to be used in future allocation calls
+            'segment_alloc', # the caching allocator asked cudaMalloc for more memory
+                             # and added it as a segment in its cache
+            'segment_free', # the caching allocator called cudaFree to return memory
+                            # to cuda, possibly trying to free up memory to
+                            # allocate more segments or because empty_cache was called
+            'oom',          # the allocator threw an OOM exception. 'size' is
+                            # the requested number of bytes that did not succeed
+            'snapshot'      # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
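+
+    Example (illustrative only; assumes CUDA is available) of routing a single
+    allocation through a private pool via :func:`~torch.cuda.use_mem_pool`::
+
+        >>> pool = torch.cuda.MemPool()  # doctest: +SKIP
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     buf = torch.empty(1024, device="cuda")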
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
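+
+
+# Illustrative sketch: one way the peak-memory helpers in this module are commonly
+# combined to measure a single step. `step_fn` is a hypothetical caller-supplied
+# callable, and CUDA must be available for this to run.
+def _example_peak_allocated_bytes(step_fn, device=None):
+    """Run ``step_fn()`` and return the peak bytes it allocated (illustrative only)."""
+    reset_peak_memory_stats(device=device)
+    step_fn()
+    torch.cuda.synchronize(device)
+    return max_memory_allocated(device=device)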
+
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
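
The `_record_memory_history` / `_snapshot` / `_dump_snapshot` functions added above are easiest to follow from a usage sketch. A minimal example, assuming a CUDA-capable build; the workload and the output filename are placeholders, not part of the patch:

    import torch

    # Keep stack traces plus a history of all alloc/free events.
    torch.cuda.memory._record_memory_history(enabled="all", max_entries=100_000)

    # Placeholder workload: any CUDA allocations made here are recorded.
    x = torch.randn(4096, 4096, device="cuda")
    del x

    # Dump a pickle for the interactive viewer at pytorch.org/memory_viz,
    # then stop recording.
    torch.cuda.memory._dump_snapshot("memory_trace.pickle")
    torch.cuda.memory._record_memory_history(enabled=None)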
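
Similarly, a hedged sketch of swapping in a `CUDAPluggableAllocator` via `change_current_allocator`. The shared-object path and exported symbol names ("alloc.so", "my_malloc", "my_free") are placeholders for whatever the custom allocator actually provides:

    import torch

    # The exported functions must match the signatures documented above:
    #   void* my_malloc(ssize_t size, int device, cudaStream_t stream);
    #   void  my_free(void* ptr, size_t size, cudaStream_t stream);
    custom = torch.cuda.memory.CUDAPluggableAllocator(
        "alloc.so", "my_malloc", "my_free"
    )

    # Must run before the default caching allocator has been used/initialized.
    torch.cuda.memory.change_current_allocator(custom)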
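
Finally, a minimal sketch of the `MemPool` / `use_mem_pool` API defined above (the tensor shape is arbitrary):

    import torch

    pool = torch.cuda.memory.MemPool()  # follows the current allocator configuration

    with torch.cuda.memory.use_mem_pool(pool):
        # Allocations made on the current device inside this block are
        # routed to `pool` instead of the default pool.
        buf = torch.empty(1024, device="cuda")

    # Snapshot restricted to segments owned by this pool.
    segments = pool.snapshot()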
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
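+
+    Example (a minimal sketch, assuming a CUDA-capable device is available;
+    this snippet is illustrative and not part of the upstream docstring)::
+
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> buf = torch.empty(1024, 1024, device="cuda")  # reserves memory
+        >>> torch.cuda.max_memory_reserved() >= torch.cuda.memory_reserved()
+        True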
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
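+
+    Example (a minimal sketch, assuming a CUDA-capable build; this snippet is
+    illustrative and not part of the upstream docstring)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     t = torch.arange(8, device="cuda")  # allocated from pool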
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
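+
+
+# A minimal usage sketch, assuming a CUDA-capable build of PyTorch. The helper
+# name below (_example_mem_pool_usage) is illustrative only and is not part of
+# the module's API; it simply shows how the pool and statistics helpers defined
+# above can be combined.
+def _example_mem_pool_usage() -> None:  # pragma: no cover - illustration only
+    """Route a temporary allocation through a private :class:`MemPool`."""
+    if not torch.cuda.is_available():
+        return
+    pool = MemPool()  # uses the current CUDACachingAllocator configuration
+    with use_mem_pool(pool):
+        scratch = torch.empty(1024, device="cuda")  # served from the pool
+        del scratch
+    # Device-wide statistics still account for segments held by the pool.
+    print(memory_summary(abbreviated=True))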
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
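+# --- Illustrative usage sketch (not part of the upstream module) ---------------------
+# A hypothetical walk over the Snapshot structure documented above: sum the bytes of
+# blocks that are currently backing tensors ('active_allocated'). The helper name is an
+# assumption made for this example only; it is never called at import time.
+def _example_active_bytes_from_snapshot(snapshot=None):
+    snapshot = _snapshot() if snapshot is None else snapshot
+    total = 0
+    for segment in snapshot["segments"]:
+        for block in segment["blocks"]:
+            if block["state"] == "active_allocated":
+                total += block["size"]
+    return total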
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
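+# --- Illustrative usage sketch (not part of the upstream module) ---------------------
+# A hypothetical end-to-end use of the pluggable-allocator API above. The shared-library
+# path and the exported symbol names ("my_alloc"/"my_free") are assumptions; they must
+# match functions actually compiled into your .so with the documented signatures, and
+# the switch must happen before the default allocator is first used. Never called at
+# import time.
+def _example_switch_to_pluggable_allocator(so_path="./my_allocator.so"):
+    custom = CUDAPluggableAllocator(so_path, "my_alloc", "my_free")
+    change_current_allocator(custom)
+    return _get_current_allocator()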
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
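+# --- Illustrative usage sketch (not part of the upstream module) ---------------------
+# A hypothetical demonstration of MemPool + use_mem_pool: allocations made inside the
+# context manager are routed to the pool. The tensor shape and the helper name are
+# assumptions; guarded so it only does work when CUDA is available, and never called at
+# import time.
+def _example_allocate_into_pool():
+    if not torch.cuda.is_available():
+        return None
+    pool = MemPool()  # pool backed by the default allocator configuration
+    with use_mem_pool(pool):
+        buf = torch.empty(1024, device="cuda")  # allocated out of `pool`
+    return pool.use_count(), buf.numel()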
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream, this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream.
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
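+
+    Example::
+
+        >>> # Illustrative sketch, assuming CUDA is available; prints the
+        >>> # abbreviated allocator statistics for the current device.
+        >>> print(torch.cuda.memory_summary(abbreviated=True))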
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
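+
+    Example::
+
+        >>> # Illustrative sketch, assuming CUDA is available: query the
+        >>> # free and total device memory, both reported in bytes.
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()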
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int #  cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                           'active_awaiting_free', # waiting for another stream to finish using
+                                                   # this, then it will become free
+                           'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+                filename: str
+                line: int
+                name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc'  # memory allocated
+            'free_requested', # the allocator received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                              # able to be used in future allocation calls
+            'segment_alloc', # the caching allocator asked cudaMalloc for more memory
+                             # and added it as a segment in its cache
+            'segment_free', # the caching allocator called cudaFree to return memory
+                            # to cuda, possibly trying to free up memory to
+                            # allocate more segments or because empty_cache() was called
+            'oom',          # the allocator threw an OOM exception. 'size' is
+                            # the requested number of bytes that did not succeed
+            'snapshot'      # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
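+
+    Example::
+
+        >>> # Illustrative sketch, assuming a CUDA device is available:
+        >>> # allocations made inside the context are routed to this pool.
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")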
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocated block received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
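+
+
+# ---------------------------------------------------------------------------
+# Usage sketch (illustrative only, not part of the upstream module): the
+# snippet below exercises the helpers defined above, namely allocator
+# statistics, memory-history recording with a snapshot dump, and routing
+# allocations to a MemPool. It assumes this file is importable as
+# ``torch.cuda.memory`` (e.g. run via ``python -m torch.cuda.memory``) so the
+# relative imports such as ``_lazy_init`` resolve, and that a CUDA device is
+# available; the filename ``example_snapshot.pickle`` is arbitrary.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        # Allocate a tensor and print allocator statistics for the device.
+        x = torch.empty(1024, 1024, device="cuda")
+        print(memory_allocated(), "bytes currently allocated")
+        print(memory_summary(abbreviated=True))
+
+        # Record alloc/free history, then dump a snapshot that can be opened
+        # in the viewer at pytorch.org/memory_viz.
+        _record_memory_history(enabled="all")
+        y = torch.empty(256, 256, device="cuda")
+        del y
+        _dump_snapshot("example_snapshot.pickle")
+        _record_memory_history(enabled=None)
+
+        # Route allocations inside the block to a dedicated memory pool.
+        pool = MemPool()
+        with use_mem_pool(pool):
+            z = torch.empty(512, 512, device="cuda")
+        print("pool id:", pool.id, "use count:", pool.use_count())
+        del x, z
+        empty_cache()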
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream; this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream. If it is ``None`` then
+            the default stream for the selected device is used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
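+
+    Example (illustrative sketch; assumes a CUDA-enabled build and device)::
+
+        >>> stream = torch.cuda.current_stream()
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024, stream=stream)
+        >>> torch.cuda.caching_allocator_delete(ptr)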
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the memory the caching allocator may allocate
+    on a CUDA device. The allowed value equals the total visible memory multiplied
+    by the fraction. Trying to allocate more than the allowed value in a process
+    raises an out of memory error in the allocator.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and is visible in
+    `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
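+
+    Example (illustrative; assumes a CUDA device with prior allocations)::
+
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> del x                      # the block returns to the caching allocator
+        >>> torch.cuda.empty_cache()   # cached, unused blocks are released to the driver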
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
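+
+    A minimal usage sketch (assuming this module is exposed as ``torch.cuda``,
+    as the cross-references above do, and that a CUDA context can be
+    initialized)::
+
+        print(torch.cuda.memory_summary(abbreviated=True))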
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
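+
+    Example, as a small illustrative sketch (assuming at least one visible
+    CUDA device; the formatting of the printout is arbitrary)::
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"{free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB")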
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
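+
+
+# Usage sketch (kept as a comment so importing this module stays side-effect
+# free): one way the MemPool and use_mem_pool helpers defined above are
+# expected to compose. It assumes an initialized CUDA context and that this
+# module is exposed as ``torch.cuda``, as the docstring cross-references above
+# suggest; the allocation size is arbitrary.
+#
+#     pool = torch.cuda.MemPool()  # pool backed by the current allocator configuration
+#     with torch.cuda.use_mem_pool(pool):
+#         buf = torch.empty(1 << 20, device="cuda")  # this allocation is routed to `pool`
+#     segments = pool.snapshot()  # memory snapshot taken while this pool is active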
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
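
For reference, the allocation-history workflow that the docstrings above describe
(`_record_memory_history`, `_snapshot`, `_dump_snapshot`) is normally driven through the
public ``torch.cuda.memory`` entry points. The snippet below is only a minimal
illustrative sketch, assuming a CUDA-enabled PyTorch build; the tensor shapes, the
variable name, and the output filename are placeholders chosen for this example:

    import torch

    if torch.cuda.is_available():
        # Record alloc/free events together with Python tracebacks.
        torch.cuda.memory._record_memory_history(enabled="all", stacks="python")
        buffers = [torch.empty(1024, 1024, device="cuda") for _ in range(4)]
        del buffers  # frees are recorded too while enabled="all"
        # Write a pickle that the pytorch.org/memory_viz viewer can open.
        torch.cuda.memory._dump_snapshot("example_snapshot.pickle")
        torch.cuda.memory._record_memory_history(enabled=None)  # stop recording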
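
Similarly, ``MemPool`` and the ``use_mem_pool`` context manager defined at the end of the
file route allocations into a caller-owned pool while the context is active. A hedged
sketch under the same assumptions (CUDA build available; the allocation size and variable
names are arbitrary):

    import torch
    from torch.cuda.memory import MemPool, use_mem_pool

    if torch.cuda.is_available():
        pool = MemPool()  # no custom allocator: uses the CUDACachingAllocator defaults
        with use_mem_pool(pool):
            scratch = torch.empty(1 << 20, device="cuda")  # allocated inside the pool
        # Leaving the context routes new allocations back to the default pool.
        print(pool.id, pool.use_count())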
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',           # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',   # the caching allocator asked cudaMalloc for more memory
+                               # and added it as a segment in its cache
+            'segment_free',    # the caching allocator called cudaFree to return memory
+                               # to cuda, possibly to free up memory to
+                               # allocate more segments or because empty_cache() was called
+            'oom',             # the allocator threw an OOM exception. 'size' is
+                               # the requested number of bytes that did not succeed
+            'snapshot',        # the allocator generated a memory snapshot
+                               # useful to correlate a previously taken
+                               # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
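+
+    Example (an editor-added, minimal usage sketch; assumes a CUDA-capable
+    build with at least one visible device)::
+
+        pool = torch.cuda.MemPool()  # pool backed by the default allocator
+        with torch.cuda.use_mem_pool(pool):
+            # allocations made inside this block are routed to ``pool``
+            x = torch.empty(1024, device="cuda")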
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
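+
+    Example (a minimal sketch; it assumes CUDA is available and that this
+    class is re-exported as ``torch.cuda.MemPool``)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> pool.id           # tuple of two ints identifying the pool
+        >>> pool.use_count()  # reference count currently held on the pool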
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
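+
+    Example (an illustrative sketch; it assumes CUDA is initialized and that
+    the returned raw pointer is always released with
+    :func:`~torch.cuda.caching_allocator_delete`)::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)  # 1 KiB on the current device/stream
+        >>> # ... hand ``ptr`` to another framework ...
+        >>> torch.cuda.caching_allocator_delete(ptr)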
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
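+
+    Example (an illustrative sketch of per-iteration peak tracking; ``loader``
+    and ``step`` are hypothetical stand-ins for a data loader and a training
+    step)::
+
+        >>> for batch in loader:
+        ...     torch.cuda.reset_peak_memory_stats()
+        ...     step(batch)
+        ...     peak_bytes = torch.cuda.max_memory_reserved()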
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
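+
+    Example (a minimal sketch; it assumes CUDA is available and that
+    ``MemPool`` and ``use_mem_pool`` are re-exported under ``torch.cuda``)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(64, device="cuda")   # routed to ``pool``
+        >>> y = torch.empty(64, device="cuda")       # back to the default pool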
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
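+
+    Example (an illustrative sketch; it assumes CUDA is available)::
+
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> del x                         # block returns to the caching allocator
+        >>> torch.cuda.empty_cache()      # cached blocks are handed back to CUDA
+        >>> torch.cuda.memory_reserved()  # reserved memory typically drops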
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
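+
+    Example (illustrative; assumes CUDA is available)::
+
+        >>> # print an abbreviated allocator report for the current device
+        >>> print(torch.cuda.memory_summary(abbreviated=True))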
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
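+
+    Example (illustrative; assumes CUDA is available)::
+
+        >>> free, total = torch.cuda.mem_get_info()
+        >>> print(f"{free / 1024**3:.2f} GiB free of {total / 1024**3:.2f} GiB total")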
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int #  cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                           'active_awaiting_free', # waiting for another stream to finish using
+                                                   # this, then it will become free
+                           'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested', # the allocation received a call to free memory
+                'free_completed', # the memory that was requested to be freed is now
+                                  # able to be used in future allocation calls
+                'segment_alloc', # the caching allocator asks cudaMalloc for more memory
+                                 # and added it as a segment in its cache
+                'segment_free', # the caching allocator called cudaFree to return memory
+                                # to cuda, possibly trying to free up memory to
+                                # allocate more segments or because empty_cache was called
+                'oom', # the allocator threw an OOM exception. 'size' is
+                       # the requested number of bytes that did not succeed
+                'snapshot', # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
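+
+    Example (illustrative sketch; assumes CUDA is available and that
+    ``MemPool``/``use_mem_pool`` are re-exported under ``torch.cuda``)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # allocation routed to ``pool``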
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
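+
+    Example (illustrative sketch; assumes CUDA is available)::
+
+        >>> # reserve 1 MiB through the caching allocator on the current stream
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024 * 1024)
+        >>> torch.cuda.caching_allocator_delete(ptr)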
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
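+
+    Example (illustrative; assumes CUDA is available)::
+
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> x = torch.empty(4096, 4096, device="cuda")
+        >>> torch.cuda.max_memory_reserved()  # peak bytes reserved since the reset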
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes the amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc'  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the ``.so`` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
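+
+    Example (an illustrative sketch; assumes a CUDA-capable build)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            x = torch.randn(1024, device="cuda")  # this allocation is routed to ``pool``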
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
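+
+
+# Illustrative sketch (hedged, kept as comments so module behavior is unchanged):
+# a typical way to combine the debugging helpers defined above when chasing a
+# memory issue.
+#
+#     torch.cuda.memory._record_memory_history(max_entries=100000)  # start tracing
+#     # ... run the workload under investigation ...
+#     torch.cuda.memory._dump_snapshot("dump_snapshot.pickle")  # open at pytorch.org/memory_viz
+#     print(torch.cuda.memory_summary())  # quick textual overview
+#     torch.cuda.memory._record_memory_history(enabled=None)  # stop recording
+#
+# And a sketch of loading a custom allocator; "alloc.so", "my_alloc" and "my_free"
+# are hypothetical names, not part of this patch:
+#
+#     custom = torch.cuda.memory.CUDAPluggableAllocator("alloc.so", "my_alloc", "my_free")
+#     torch.cuda.memory.change_current_allocator(custom)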
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from .
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
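+
+    Example (an illustrative sketch; it assumes a CUDA build of PyTorch and at
+    least one visible CUDA device)::
+
+        import torch
+
+        if torch.cuda.is_available():
+            # Print the full, non-abbreviated breakdown for device 0.
+            print(torch.cuda.memory_summary(device=0, abbreviated=False))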
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
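+
+    Example (an illustrative sketch; it assumes a CUDA build of PyTorch and at
+    least one visible CUDA device)::
+
+        import torch
+
+        if torch.cuda.is_available():
+            free_bytes, total_bytes = torch.cuda.mem_get_info(0)
+            # Both values are plain integers, in bytes.
+            print(f"free {free_bytes / 2**30:.2f} GiB of {total_bytes / 2**30:.2f} GiB")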
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator, use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator currently being used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
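+
+    Example (an illustrative sketch; it assumes a CUDA build of PyTorch and at
+    least one visible CUDA device)::
+
+        import torch
+
+        if torch.cuda.is_available():
+            pool = torch.cuda.MemPool()
+            with torch.cuda.use_mem_pool(pool):
+                # Allocations made inside this block are routed to ``pool``.
+                x = torch.empty(1024, device="cuda")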
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
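+
+
+# ---------------------------------------------------------------------------
+# Illustrative usage sketches (editor's additions, not part of the upstream
+# module).  Every file name, symbol name, and helper defined below is
+# hypothetical; each sketch assumes a CUDA-capable device and is a hedged
+# illustration of the APIs documented above, not a definitive recipe.
+# ---------------------------------------------------------------------------
+
+
+def _example_record_and_dump_snapshot():
+    """Sketch: record allocator history, then dump a snapshot that can be
+    opened in the interactive viewer at pytorch.org/memory_viz."""
+    if not torch.cuda.is_available():
+        return
+    # Start recording stack traces and a history of alloc/free events.
+    _record_memory_history(enabled="all")
+    # Do some work that allocates and frees CUDA memory.
+    tensors = [torch.empty(1024, 1024, device="cuda") for _ in range(4)]
+    del tensors
+    # Persist the snapshot; "example_snapshot.pickle" is an arbitrary name.
+    _dump_snapshot("example_snapshot.pickle")
+    # Stop recording.
+    _record_memory_history(enabled=None)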
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
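+
+
+def _example_walk_snapshot():
+    """Sketch: iterate the Snapshot structure documented in ``_snapshot`` and
+    report reserved vs. actively used bytes per segment."""
+    if not torch.cuda.is_available():
+        return
+    snapshot = _snapshot()
+    for segment in snapshot["segments"]:
+        # Blocks in the 'active_allocated' state back live tensors.
+        active = sum(
+            block["size"]
+            for block in segment["blocks"]
+            if block["state"] == "active_allocated"
+        )
+        print(
+            f"segment @ {segment['address']:#x}: "
+            f"{segment['total_size']} bytes reserved, {active} bytes active"
+        )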
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
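+
+
+def _example_use_mem_pool():
+    """Sketch: route allocations made inside the context manager to a private
+    memory pool, then inspect the pool afterwards."""
+    if not torch.cuda.is_available():
+        return
+    pool = MemPool()  # default CUDACachingAllocator configuration
+    with use_mem_pool(pool):
+        # Tensors created here are allocated out of ``pool``.
+        scratch = torch.empty(256, 256, device="cuda")
+        del scratch
+    print("pool id:", pool.id, "use count:", pool.use_count())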
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+
+
+    In addition to the core statistics, we also provide some simple event
+    counters:
+
+    - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both
+      cudaHostAlloc and cudaHostRegister.
+    - ``"num_host_free"``: number of CUDA free calls. This includes both cudaFreeHost
+      and cudaHostUnregister.
+
+    Finally, we also provide some simple timing counters:
+
+    - ``"host_alloc_time.{total,max,min,count,avg}"``:
+      timing of allocation requests going through CUDA calls.
+    - ``"host_free_time.{total,max,min,count,avg}"``:
+      timing of free requests going through CUDA calls.
+
+    For these timing statistics, values are broken down as follows.
+
+    Metric type:
+
+    - ``total``: total time spent.
+    - ``max``: maximum value per call.
+    - ``min``: minimum value per call.
+    - ``count``: number of times it was called.
+    - ``avg``: average time per call.
+    """
+    result = []
+
+    def _recurse_add_to_result(prefix, obj):
+        if isinstance(obj, dict):
+            if len(prefix) > 0:
+                prefix += "."
+            for k, v in obj.items():
+                _recurse_add_to_result(prefix + k, v)
+        else:
+            result.append((prefix, obj))
+
+    stats = host_memory_stats_as_nested_dict()
+    _recurse_add_to_result("", stats)
+    result.sort()
+
+    return collections.OrderedDict(result)
+
+
+def host_memory_stats_as_nested_dict() -> dict[str, Any]:
+    r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary."""
+    if not is_initialized():
+        return {}
+    return torch._C._cuda_hostMemoryStats()
+
+
+def reset_accumulated_host_memory_stats() -> None:
+    r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator.
+
+    See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to
+    the `"allocated"` and `"freed"` keys in each individual stat dict.
+    """
+    return torch._C._cuda_resetAccumulatedHostMemoryStats()
+
+
+def reset_peak_host_memory_stats() -> None:
+    r"""Reset the "peak" stats tracked by the host memory allocator.
+
+    See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the
+    `"peak"` key in each individual stat dict.
+    """
+    return torch._C._cuda_resetPeakHostMemoryStats()
+
+
+def reset_max_memory_allocated(device: Union[Device, int] = None) -> None:
+    r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device.
+
+    See :func:`~torch.cuda.max_memory_allocated` for details.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. warning::
+        This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets
+        /all/ peak memory stats.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    warnings.warn(
+        "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, "
+        "which resets /all/ peak memory stats.",
+        FutureWarning,
+    )
+    return reset_peak_memory_stats(device=device)
+
+
+def reset_max_memory_cached(device: Union[Device, int] = None) -> None:
+    r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device.
+
+    See :func:`~torch.cuda.max_memory_cached` for details.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
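+
+    A minimal usage sketch (assumes a CUDA-capable device is available; the
+    tensor size is an arbitrary placeholder)::
+
+        >>> _ = torch.empty(4, 1024, 1024, device="cuda")
+        >>> peak_bytes = torch.cuda.max_memory_reserved()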
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
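+        # ROCm/HIP builds enumerate GPU processes through amdsmi below, rather
+        # than through pynvml as in the CUDA branch above.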
+
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory. 
+
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                           'active_awaiting_free', # waiting for another stream to finish using
+                                                   # this, then it will become free
+                           'inactive',] # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested', # the allocated block received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                              # able to be used in future allocation calls
+            'segment_alloc', # the caching allocator asked cudaMalloc for more memory
+                             # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',          # the allocator threw an OOM exception. 
'size' is
+                            # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator, use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
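+
+    A minimal sketch (assumes a CUDA-capable device and that ``MemPool`` and
+    ``use_mem_pool`` are re-exported under ``torch.cuda``; the allocation size
+    is a placeholder)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     buf = torch.empty(1 << 20, device="cuda")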
+
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
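+
+
+# A minimal debugging-workflow sketch (illustrative only; assumes a CUDA-capable
+# device and that this module is importable as ``torch.cuda.memory``):
+#
+#     torch.cuda.memory._record_memory_history(max_entries=100_000)
+#     ...  # run the workload under investigation
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")
+#     torch.cuda.memory._record_memory_history(enabled=None)  # stop recording
+#
+# The resulting "snapshot.pickle" can be opened in the interactive viewer at
+# pytorch.org/memory_viz, as described in ``_dump_snapshot`` above.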
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
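+
+
+# Usage sketch (illustrative, not part of the upstream module): a minimal example
+# of combining the APIs defined above. It assumes a CUDA-capable build and an
+# available device; the file name "snapshot.pickle" and the tensor sizes are
+# arbitrary choices for the example.
+#
+#     import torch
+#
+#     # Start recording allocation history (Python stacks by default).
+#     torch.cuda.memory._record_memory_history(max_entries=100_000)
+#
+#     # Do some work that allocates GPU memory.
+#     x = torch.randn(4096, 4096, device="cuda")
+#     y = x @ x
+#
+#     # Inspect allocator state.
+#     print(torch.cuda.memory_allocated(), torch.cuda.memory_reserved())
+#     print(torch.cuda.memory_summary(abbreviated=True))
+#
+#     # Dump a snapshot that can be opened at pytorch.org/memory_viz,
+#     # then stop recording.
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")
+#     torch.cuda.memory._record_memory_history(enabled=None)
+#
+#     # Route allocations from a region of code to a private memory pool.
+#     pool = torch.cuda.MemPool()
+#     with torch.cuda.use_mem_pool(pool):
+#         z = torch.empty(1024, device="cuda")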
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
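+
+    Example (an illustrative sketch, assuming a CUDA-enabled build with at
+    least one visible device)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> print(torch.cuda.memory_summary(abbreviated=True))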
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
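+
+    Example (an illustrative sketch, assuming a CUDA-enabled build with at
+    least one visible device)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> free, total = torch.cuda.mem_get_info()
+        >>> print(f"{free} of {total} bytes currently free")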
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
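+
+    Example (an illustrative sketch, assuming a CUDA-enabled build with at
+    least one visible device)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.randn(1024, device="cuda")  # allocation is routed to ``pool``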
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
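+
+
+if __name__ == "__main__":
+    # Illustrative usage sketch only -- not part of the upstream module. It routes a
+    # few allocations through a dedicated pool using the helpers defined above and
+    # prints basic allocator statistics. Assumes a CUDA-enabled build with at least
+    # one visible device, and that this file is run as a module inside its package
+    # (e.g. ``python -m <package>.asdds``) so the relative imports resolve.
+    if torch.cuda.is_available():
+        pool = MemPool()
+        with use_mem_pool(pool):
+            x = torch.ones(1 << 20, device="cuda")  # ~4 MiB of float32 routed to the pool
+        print("pool id:", pool.id)
+        print("pool use_count:", pool.use_count())
+        print("allocated bytes:", memory_allocated())
+        del x
+        empty_cache()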
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
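+
+
+# ---------------------------------------------------------------------------
+# Hedged usage sketch (editorial addition): the block below is an illustrative
+# demo, not part of the module's API. It only calls public helpers defined
+# above (MemPool, use_mem_pool, memory_stats, memory_summary); the tensor
+# shape and the printed stat key are assumptions chosen for the example.
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        # Route a few allocations through a private pool, then inspect the
+        # allocator statistics for the current device.
+        demo_pool = MemPool()
+        with use_mem_pool(demo_pool):
+            scratch = torch.empty(1024, 1024, device="cuda")
+            del scratch
+        stats = memory_stats()
+        print("peak allocated bytes:", stats.get("allocated_bytes.all.peak", 0))
+        print(memory_summary(abbreviated=True))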
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
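+
+    Example (a minimal illustrative sketch)::
+
+        >>> # e.g. print an abbreviated per-device summary inside an OOM handler
+        >>> print(torch.cuda.memory_summary(abbreviated=True))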
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
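+
+    Example (illustrative sketch)::
+
+        >>> print(torch.cuda.list_gpu_processes())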
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
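+
+    Example (a minimal illustrative sketch)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes / 1024**3:.1f} GiB free of {total_bytes / 1024**3:.1f} GiB")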
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
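+            # (this total is what torch.cuda.memory_reserved() reports)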
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+                filename: str
+                line: int
+                name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to CUDA, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory CUDA still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
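+
+    A minimal sketch (assuming the pool helpers are re-exported as
+    ``torch.cuda.MemPool`` and ``torch.cuda.use_mem_pool``)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # allocation is routed to ``pool``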
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
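+
+
+# A minimal usage sketch (illustrative only): the private recording/snapshot helpers
+# above are typically combined as follows to capture an allocation history that can
+# be inspected with the viewer at pytorch.org/memory_viz:
+#
+#     torch.cuda.memory._record_memory_history(max_entries=100_000)
+#     ...  # run the workload to be profiled
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")
+#     torch.cuda.memory._record_memory_history(enabled=None)  # stop recording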
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
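The private snapshot helpers added above (`_record_memory_history`, `_snapshot`, `_dump_snapshot`) are easiest to see in a short usage sketch. The following is not part of the diff; it assumes a CUDA-enabled PyTorch build where this module is importable as `torch.cuda.memory`, as in upstream.

    import torch

    if torch.cuda.is_available():
        # Record stack traces plus a history of every alloc/free event.
        torch.cuda.memory._record_memory_history(enabled="all", stacks="python")

        x = torch.randn(1024, 1024, device="cuda")  # some work to capture
        del x

        snap = torch.cuda.memory._snapshot()  # raw Snapshot dict described in the docstring
        print(len(snap["segments"]), "segments in snapshot")

        # Pickle the snapshot for the interactive viewer at pytorch.org/memory_viz.
        torch.cuda.memory._dump_snapshot("example_snapshot.pickle")

        # Stop recording.
        torch.cuda.memory._record_memory_history(enabled=None)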
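A hedged sketch of the `CUDAPluggableAllocator` / `change_current_allocator` flow documented above. `./my_alloc.so`, `my_malloc`, and `my_free` are placeholder names for a user-compiled library, not something shipped by this patch, and the swap must happen before the native allocator is first used.

    import torch

    # Placeholder library and function names; the .so must export the documented
    # C signatures for the alloc and free hooks.
    custom = torch.cuda.memory.CUDAPluggableAllocator(
        "./my_alloc.so",  # hypothetical path to the compiled allocator
        "my_malloc",      # void* my_malloc(ssize_t size, int device, cudaStream_t stream)
        "my_free",        # void  my_free(void* ptr, size_t size, cudaStream_t stream)
    )

    # Raises an error if the current allocator has already been initialized.
    torch.cuda.memory.change_current_allocator(custom)

    y = torch.empty(1024, device="cuda")  # now served by the custom allocator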
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
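+
+    Example (an illustrative sketch; it assumes a CUDA device is available and
+    that the default allocator is acceptable, i.e. ``allocator=None``)::
+
+        >>> pool = torch.cuda.MemPool()  # allocator=None -> current allocator
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     buf = torch.empty(1024, device="cuda")  # allocation routed to the pool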
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
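+
+
+# The private helpers above are typically combined into a short debugging
+# workflow: enable history recording, run the suspect code, then dump a
+# snapshot for the viewer at pytorch.org/memory_viz. The function below is an
+# illustrative sketch rather than an upstream API: `_make_some_tensors` stands
+# in for any real workload and the output filename is arbitrary.
+def _example_memory_debug_workflow(out_file: str = "memory_snapshot.pickle") -> None:
+    """Illustrative only: record allocator history, run a workload, dump a snapshot."""
+
+    def _make_some_tensors() -> None:
+        # hypothetical workload; any CUDA allocations made here are traced
+        for _ in range(4):
+            torch.empty(1024, 1024, device="cuda")
+
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    try:
+        _make_some_tensors()
+        _dump_snapshot(out_file)
+    finally:
+        _record_memory_history(enabled=None)  # stop recording history
+
+
+# CUDAPluggableAllocator and change_current_allocator are meant to be used
+# together: compile an allocator into a shared library, wrap it, then make it
+# active before the native allocator is first used. The sketch below is
+# illustrative; "my_alloc.so", "my_malloc" and "my_free" are hypothetical names
+# for a library exporting the signatures documented above.
+def _example_install_pluggable_allocator(so_path: str = "my_alloc.so") -> None:
+    """Illustrative only: load a custom allocator from ``so_path`` and activate it."""
+    custom = CUDAPluggableAllocator(so_path, "my_malloc", "my_free")
+    change_current_allocator(custom)  # raises if the current allocator was already used
+
+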
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream, this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream.
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
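+
+    Example (an illustrative sketch; it assumes a CUDA device and that the
+    tensor below holds the only reference to its memory)::
+
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> del x                              # the block returns to the allocator cache
+        >>> torch.cuda.empty_cache()           # fully inactive segments are released
+        >>> reserved = torch.cuda.memory_reserved()  # typically lower than before the call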
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
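+
+    A short usage sketch (illustrative only; the device index ``0`` is an
+    assumption, not a requirement)::
+
+        >>> print(torch.cuda.memory_summary(device=0, abbreviated=True))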
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
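+
+    A small usage sketch (illustrative only; assumes at least one visible
+    CUDA device)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> free_bytes <= total_bytes
+        True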
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                           'active_awaiting_free', # waiting for another stream to finish using
+                                                   # this, then it will become free
+                           'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc'  # memory allocated
+                'free_requested', # the allocation received a call to free memory
+                'free_completed', # the memory that was requested to be freed is now
+                                  # able to be used in future allocation calls
+                'segment_alloc', # the caching allocator asked cudaMalloc for more memory
+                                 # and added it as a segment in its cache
+                'segment_free', # the caching allocator called cudaFree to return memory
+                                # to CUDA, possibly trying to free up memory to
+                                # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
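+
+    A minimal usage sketch (illustrative only; assumes the class is exposed as
+    ``torch.cuda.MemPool``, as in the stock ``torch.cuda.memory`` module)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> pool_id = pool.id  # a tuple of two ints identifying the pool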
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
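+
+
+# A minimal routing sketch for ``use_mem_pool`` above (illustrative only; it
+# assumes a CUDA build with at least one visible device):
+#
+#     pool = MemPool()
+#     with use_mem_pool(pool):
+#         buf = torch.empty(1024, device="cuda")  # allocation is routed to `pool`
+#     print(pool.snapshot())  # inspect the segments owned by the pool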
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes the amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc'  # memory allocated
+            'free_requested',  # the allocation received a request to be freed
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to CUDA, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator`
+        function, as shown in the usage sketch at the end of this file.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
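+
+
+# Usage sketch: a minimal, hedged example of how the memory-debugging helpers
+# defined above can be combined. It assumes a CUDA-capable PyTorch build at
+# runtime; the tensor shape and the file name "memory_snapshot.pickle" are
+# arbitrary placeholders, not part of the module's API.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        # Record stack traces plus a full history of alloc/free events.
+        _record_memory_history("all")
+
+        # Route a few allocations through a private memory pool.
+        pool = MemPool()
+        with use_mem_pool(pool):
+            x = torch.empty(1024, 1024, device="cuda")  # kept alive for the snapshot
+
+        # Human-readable allocator statistics for the current device.
+        print(memory_summary(abbreviated=True))
+
+        # Persist the snapshot for the interactive viewer at pytorch.org/memory_viz.
+        _dump_snapshot("memory_snapshot.pickle")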
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc'  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
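For reference, a minimal usage sketch of the statistics, snapshot, and pooling helpers added above. It assumes a CUDA-capable PyTorch build that re-exports these functions under torch.cuda / torch.cuda.memory, as listed in __all__; the tensor sizes and the snapshot filename are arbitrary illustrations, not part of the patch.

import torch

if torch.cuda.is_available():
    dev = torch.cuda.current_device()

    # Measure the peak allocation of one unit of work.
    torch.cuda.reset_peak_memory_stats(dev)
    x = torch.randn(1024, 1024, device="cuda")
    y = x @ x
    print("peak allocated (bytes):", torch.cuda.max_memory_allocated(dev))
    print(torch.cuda.memory_summary(dev, abbreviated=True))

    # Record allocation history and dump a snapshot for pytorch.org/memory_viz.
    torch.cuda.memory._record_memory_history("all")
    _ = torch.randn(512, 512, device="cuda")
    torch.cuda.memory._dump_snapshot("example_snapshot.pickle")  # illustrative filename

    # Route allocations made inside the context to a dedicated pool.
    pool = torch.cuda.MemPool()
    with torch.cuda.use_mem_pool(pool, device=dev):
        z = torch.empty(256, device="cuda")
    print("pool id:", pool.id, "refcount:", pool.use_count())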
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
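Editor's note: the recording and snapshot helpers documented above (`_record_memory_history`, `_snapshot`, `_dump_snapshot`) are typically used together: enable history recording, run the workload, then dump the snapshot and open it in the viewer at pytorch.org/memory_viz. A minimal sketch follows; the output file name and tensor sizes are arbitrary placeholders.

    import torch
    from torch.cuda.memory import _record_memory_history, _dump_snapshot

    # Keep stack traces plus a history of all alloc/free events.
    _record_memory_history(enabled="all", context="all", stacks="all")

    # Run some CUDA work so there is something to record.
    x = torch.randn(1024, 1024, device="cuda")
    y = x @ x

    # Write the snapshot to disk; open it at pytorch.org/memory_viz.
    _dump_snapshot("my_run.pickle")

    # Stop recording when finished.
    _record_memory_history(enabled=None)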
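A common pattern with the per-device counters defined earlier in this module (`memory_allocated`, `max_memory_allocated`, `memory_reserved`, `reset_peak_memory_stats`) is to reset the peak statistics before a step and read them afterwards. A minimal sketch, assuming a CUDA device is available:

    import torch

    device = torch.device("cuda")
    torch.cuda.reset_peak_memory_stats(device)

    x = torch.randn(4096, 4096, device=device)
    y = x @ x

    print(torch.cuda.memory_allocated(device))      # bytes currently held by tensors
    print(torch.cuda.max_memory_allocated(device))  # peak since the reset above
    print(torch.cuda.memory_reserved(device))       # bytes held by the caching allocator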
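To illustrate the pluggable-allocator path described above, the sketch below loads two exported C symbols from a shared library and installs them before any CUDA memory is allocated. The library and symbol names ("my_allocator.so", "my_malloc", "my_free") are placeholders; the symbols must match the C signatures given in the `CUDAPluggableAllocator` docstring.

    import torch

    # Placeholder library and symbol names; the symbols must follow
    #   void* my_malloc(ssize_t size, int device, cudaStream_t stream);
    #   void  my_free(void* ptr, size_t size, cudaStream_t stream);
    new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
        "my_allocator.so", "my_malloc", "my_free"
    )

    # Must run before the default allocator has been initialized, otherwise
    # change_current_allocator raises an error.
    torch.cuda.memory.change_current_allocator(new_alloc)

    x = torch.zeros(10, device="cuda")  # now served by the custom allocator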
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
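+
+
+# The following helper is an illustrative sketch added for documentation purposes
+# and is not part of the upstream torch.cuda.memory module. It assumes a working
+# CUDA (or ROCm) driver and simply combines mem_get_info() and list_gpu_processes()
+# from above into a one-shot, human-readable report.
+def _example_device_memory_report(device=None):
+    # Query global free/total memory via cudaMemGetInfo, then append the
+    # per-process listing produced by list_gpu_processes().
+    free, total = mem_get_info(device)
+    report = [
+        f"free : {free / (1024 ** 3):.2f} GiB",
+        f"total: {total / (1024 ** 3):.2f} GiB",
+        list_gpu_processes(device),
+    ]
+    return "\n".join(report)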
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
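+
+
+# Illustrative sketch (not part of the upstream module): a minimal record/dump
+# round trip for the private history APIs documented above. The tensor shape and
+# the output path are arbitrary; a CUDA device is assumed to be available.
+def _example_record_and_dump(path="snapshot.pickle"):
+    # Start recording stack traces and alloc/free events.
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    # Hypothetical workload: allocate and release a tensor on the GPU.
+    x = torch.empty(1024, 1024, device="cuda")
+    del x
+    # Persist the snapshot so it can be loaded in the pytorch.org/memory_viz viewer,
+    # then stop recording.
+    _dump_snapshot(path)
+    _record_memory_history(enabled=None)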
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
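+
+
+# Illustrative sketch (not part of the upstream module): route the allocations made
+# inside `use_mem_pool` to a private MemPool and then inspect that pool's segments.
+# Assumes a CUDA device is available; the tensor size is arbitrary.
+def _example_use_mem_pool():
+    pool = MemPool()
+    with use_mem_pool(pool):
+        # Any allocation made in this block is served from `pool`.
+        buf = torch.empty(1024, device="cuda")
+    segments = pool.snapshot()  # snapshot restricted to this pool
+    del buf
+    return segments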
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
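+
+    Example (a minimal usage sketch, assuming a CUDA-capable device; the
+    tensor size below is illustrative only)::
+
+        >>> # sketch only -- requires CUDA
+        >>> import torch
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # allocation routed to ``pool``
+        >>> pool.use_count()  # reference count held on this pool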
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
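+
+    Example (a minimal usage sketch, assuming an available CUDA device; the
+    allocation size is illustrative only)::
+
+        >>> # sketch only -- requires CUDA
+        >>> import torch
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)  # 1 KiB on the current device/stream
+        >>> torch.cuda.caching_allocator_delete(ptr)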
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
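+
+    Example (a minimal measurement sketch, assuming a CUDA-capable device;
+    the matmul workload is a placeholder)::
+
+        >>> # sketch only -- requires CUDA
+        >>> import torch
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> a = torch.randn(1024, 1024, device="cuda")
+        >>> b = a @ a
+        >>> torch.cuda.max_memory_allocated()  # peak bytes occupied by tensors
+        >>> torch.cuda.max_memory_reserved()   # peak bytes held by the caching allocator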
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
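+
+    Example (a minimal usage sketch, assuming a CUDA-capable device; device
+    index 0 and the tensor size are illustrative only)::
+
+        >>> # sketch only -- requires CUDA
+        >>> import torch
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool, device=0):
+        ...     buf = torch.zeros(256, device="cuda:0")  # served from ``pool``
+        >>> torch.cuda.MemPoolContext.active_pool()  # pool active at this point, if any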
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
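+
+    Example (a minimal usage sketch, assuming a CUDA-capable device; the
+    tensor size is illustrative only)::
+
+        >>> # sketch only -- requires CUDA
+        >>> import torch
+        >>> x = torch.empty(1024 * 1024, device="cuda")
+        >>> del x                          # block returns to the allocator's cache
+        >>> torch.cuda.memory_reserved()   # cache may still hold the freed block
+        >>> torch.cuda.empty_cache()
+        >>> torch.cuda.memory_reserved()   # typically lower once the cache is released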
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
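+
+    Example (an illustrative sketch added to this copy, not upstream text;
+    it assumes a CUDA device is available)::
+
+        >>> print(torch.cuda.memory_summary())                   # full per-pool table
+        >>> print(torch.cuda.memory_summary(abbreviated=True))   # totals only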
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
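+
+    Example (an illustrative sketch added to this copy, not upstream text;
+    it assumes a CUDA device is available)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB total")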
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a shared object (``.so``) file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
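+
+
+# --- Illustrative usage sketch (editor's addition, not part of the upstream module) ---
+# A minimal example of how the memory-debugging helpers defined above could be
+# combined: record allocation history, run a workload, print a summary, and dump a
+# snapshot for the pytorch.org/memory_viz viewer. The tensor sizes and the output
+# filename are arbitrary placeholders; in user code these helpers are reached via
+# torch.cuda / torch.cuda.memory.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        # Keep stack traces for every alloc/free event from this point on.
+        _record_memory_history(enabled="all")
+
+        # A stand-in workload; any CUDA allocations made here are captured.
+        x = torch.randn(1024, 1024, device="cuda")
+        y = x @ x
+        del x, y
+
+        # Human-readable allocator statistics for the current device.
+        print(memory_summary(abbreviated=True))
+
+        # Persist the snapshot so it can be inspected interactively.
+        _dump_snapshot("example_snapshot.pickle")
+
+        # Stop recording once debugging is done.
+        _record_memory_history(enabled=None)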
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
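
The statistics, summary, and snapshot helpers defined in this file are easiest to see in use with a short sketch. The snippet below is illustrative only and is not part of the patch; it assumes a CUDA-capable build of PyTorch and that the module is exposed the way torch.cuda.memory is upstream (whose public API the file mirrors):

    import torch

    if torch.cuda.is_available():
        # Give the caching allocator something to track.
        x = torch.empty(1024, 1024, device="cuda")

        # Flat statistics dictionary; keys follow the "metric.pool.type"
        # scheme documented in memory_stats().
        stats = torch.cuda.memory_stats()
        print(stats.get("allocated_bytes.all.current", 0))

        # Peak tracking: reset, run a workload, then read the new peak.
        torch.cuda.reset_peak_memory_stats()
        y = x @ x
        print(torch.cuda.max_memory_allocated())

        # Human-readable table built by memory_summary().
        print(torch.cuda.memory_summary(abbreviated=True))

        # Record alloc/free history and dump a snapshot that the viewer at
        # pytorch.org/memory_viz can open.
        torch.cuda.memory._record_memory_history(max_entries=100_000)
        z = torch.empty(256, 256, device="cuda")
        torch.cuda.memory._dump_snapshot("example_snapshot.pickle")
        torch.cuda.memory._record_memory_history(enabled=None)

The flat keys returned by memory_stats() are the same ones memory_summary() formats into its table, so the two views can be cross-checked against each other.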
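The MemPool and use_mem_pool additions can be sketched the same way. This is a minimal illustration under the same assumptions as above; the pooling API is experimental and its surface may change:

    import torch

    if torch.cuda.is_available():
        # Create a pool and route allocations made inside the context into it.
        pool = torch.cuda.MemPool()
        with torch.cuda.use_mem_pool(pool):
            buf = torch.empty(64, 64, device="cuda")

        # Inspect the pool: its id tuple, reference count, and a snapshot
        # restricted to this pool's segments.
        print(pool.id, pool.use_count())
        print(len(pool.snapshot()))

Leaving the context manager stops routing for the current stream and releases the pool, mirroring the finally block of use_mem_pool above.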
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
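For illustration, a minimal usage sketch of the snapshot workflow documented above. It only uses the helpers defined in this patch (`_record_memory_history`, `_dump_snapshot`); the workload and the output filename are placeholders, and a CUDA-capable build is assumed:

.. code-block:: python

    import torch
    from torch.cuda import memory

    # Start recording alloc/free events together with stack traces.
    memory._record_memory_history(enabled="all", context="all", stacks="all")

    # Placeholder workload whose allocations we want to inspect.
    x = torch.randn(1024, 1024, device="cuda")
    y = x @ x
    del x, y
    torch.cuda.synchronize()

    # Write the snapshot; the pickle can be loaded in the viewer at
    # pytorch.org/memory_viz (the filename is arbitrary).
    memory._dump_snapshot("workload_snapshot.pickle")

    # Stop recording once finished.
    memory._record_memory_history(enabled=None)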
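Similarly, a short sketch of the MemPool / use_mem_pool API described in the docstrings above, assuming allocations made inside the context manager are routed to the given pool (the tensor shape is arbitrary):

.. code-block:: python

    import torch

    pool = torch.cuda.MemPool()  # pool that follows the current allocator configuration

    with torch.cuda.use_mem_pool(pool):
        # Memory for this tensor is drawn from `pool`.
        buf = torch.empty(4 * 1024 * 1024, dtype=torch.uint8, device="cuda")

    print(pool.id)            # tuple of two ints identifying the pool
    print(pool.use_count())   # reference count held on the pool
    snapshot = pool.snapshot()  # allocator snapshot taken with this pool active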
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
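+
+    Example (illustrative sketch; ``train_step`` stands in for a hypothetical
+    workload callable)::
+
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> train_step()
+        >>> peak_bytes = torch.cuda.max_memory_reserved()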
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
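+
+    Example (illustrative sketch; assumes a CUDA device is available)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # allocation is routed to pool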
+
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
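+
+
+# A minimal, illustrative usage sketch (not asserted to be upstream code) showing how
+# the statistics, history, and snapshot helpers defined in this module fit together.
+# The tensor size and the snapshot path are arbitrary assumptions made for the demo.
+def _demo_memory_inspection(snapshot_path: str = "demo_snapshot.pickle") -> None:
+    """Print a few allocator statistics and dump a snapshot for the memory visualizer."""
+    if not torch.cuda.is_available():
+        return
+    _record_memory_history("all")  # record alloc/free events with stack traces
+    x = torch.empty(4 * 1024 * 1024, device="cuda")  # ~16 MiB of float32
+    print(f"allocated: {memory_allocated()} B, reserved: {memory_reserved()} B")
+    print(memory_summary(abbreviated=True))
+    _dump_snapshot(snapshot_path)  # open the file at https://pytorch.org/memory_viz
+    del x
+    empty_cache()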
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocated block received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot',  # the allocator generated a memory snapshot
+                         # useful to correlate a previously taken
+                         # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
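+
+    A minimal illustrative sketch (assumes a CUDA device is available; the
+    exact ``id`` value is internal to the allocator and will vary)::
+
+        pool = torch.cuda.MemPool()        # pool backed by the current allocator configuration
+        assert isinstance(pool.id, tuple)  # a pair of ints identifying the pool
+        assert pool.use_count() >= 1       # the pool itself holds at least one reference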
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
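+
+
+# Illustrative sketch (editorial addition, not part of the upstream API): a
+# minimal way to combine ``MemPool`` and ``use_mem_pool`` so that allocations
+# made inside the ``with`` block are served from a private pool. Assumes at
+# least one CUDA device is available; the helper name below is hypothetical.
+def _example_route_allocations_to_pool() -> None:
+    pool = MemPool()  # pool backed by the current allocator configuration
+    with use_mem_pool(pool):
+        scratch = torch.empty(1024, 1024, device="cuda")  # allocated from `pool`
+    # Outside the context manager, new allocations go back to the default pool.
+    del scratch
+    torch.cuda.empty_cache()
+
+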
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream, this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
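+
+    A small illustrative sketch (the effect depends on what has been allocated
+    and freed earlier in the process)::
+
+        x = torch.empty(1024, 1024, device="cuda")
+        del x                     # the block returns to the caching allocator
+        torch.cuda.empty_cache()  # unoccupied cached blocks are released to CUDA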
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
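+
+    For example, the summary can be printed when an out-of-memory error is
+    caught (a sketch only; it assumes a CUDA device is available and the
+    oversized shape is arbitrary)::
+
+        import torch
+
+        try:
+            x = torch.empty(1 << 40, device="cuda")  # deliberately oversized allocation
+        except torch.cuda.OutOfMemoryError:
+            print(torch.cuda.memory_summary(abbreviated=True))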
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
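+
+    A small sketch of how the returned tuple is typically interpreted (both
+    values are in bytes; this assumes a CUDA device is available)::
+
+        import torch
+
+        free, total = torch.cuda.mem_get_info()
+        print(f"free: {free / 1024**3:.2f} GiB of {total / 1024**3:.2f} GiB")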
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc'  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
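+
+    A minimal sketch of routing allocations through a pool via
+    :func:`~torch.cuda.use_mem_pool` (illustrative only; it assumes a CUDA
+    device is available)::
+
+        import torch
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            x = torch.empty(1024, device="cuda")  # allocated from ``pool``
+        y = torch.empty(1024, device="cuda")      # allocated from the default pool again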
+
+    """
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
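+
+
+# The sketch below is illustrative only and is not part of the original module:
+# it shows one way the snapshot/recording helpers above are commonly combined
+# (record allocation history, run a workload, dump a snapshot for the
+# pytorch.org/memory_viz viewer). It assumes PyTorch was built with CUDA and a
+# GPU is present; "example_snapshot.pickle" is an arbitrary output filename.
+# It only runs when this file is executed directly.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        _record_memory_history(enabled="all", context="all", stacks="python")
+        tensors = [torch.randn(256, 256, device="cuda") for _ in range(8)]
+        del tensors
+        torch.cuda.empty_cache()
+        _dump_snapshot("example_snapshot.pickle")
+        _record_memory_history(enabled=None)  # stop recording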
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
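+
+
+# --------------------------------------------------------------------------
+# Illustrative usage sketch only (not part of the allocator module above): it
+# exercises the memory-introspection helpers defined in this file. It assumes
+# a CUDA-capable PyTorch build; the tensor shapes and the snapshot file name
+# below are arbitrary placeholders.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        # Record allocation history so the snapshot below carries stack traces.
+        torch.cuda.memory._record_memory_history(enabled="all")
+
+        # Create some allocator activity.
+        buffers = [torch.randn(1024, 1024, device="cuda") for _ in range(4)]
+
+        # Query the statistics helpers documented above.
+        print("allocated bytes:", torch.cuda.memory_allocated())
+        print("peak allocated bytes:", torch.cuda.max_memory_allocated())
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"free/total device memory: {free_bytes}/{total_bytes}")
+        print(torch.cuda.memory_summary(abbreviated=True))
+
+        # Persist a snapshot viewable at pytorch.org/memory_viz.
+        torch.cuda.memory._dump_snapshot("example_snapshot.pickle")
+
+        # Release cached blocks and stop recording history.
+        del buffers
+        torch.cuda.empty_cache()
+        torch.cuda.memory._record_memory_history(enabled=None)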
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from .
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
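+
+    A short sketch of the intended call pattern (the tensor size is arbitrary
+    and the snippet assumes a CUDA device is present):
+
+    .. code-block:: python
+
+        x = torch.empty(1024, 1024, device="cuda")
+        del x                     # the block returns to the caching allocator
+        torch.cuda.empty_cache()  # cached, unused blocks are released to CUDA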
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
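+
+    A minimal usage sketch; the device index ``0`` below is only an example
+    and assumes at least one visible CUDA device::
+
+        >>> # build the abbreviated per-device table and keep it for logging
+        >>> summary = torch.cuda.memory_summary(device=0, abbreviated=True)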
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
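+
+    A short sketch of consuming the returned ``(free, total)`` tuple, assuming
+    a CUDA device is available::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> # globally free share of the device's memory, across all processes
+        >>> free_fraction = free_bytes / total_bytes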
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
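+
+    A minimal construction sketch; routing allocations into the pool is done
+    with :func:`~torch.cuda.use_mem_pool`::
+
+        >>> # pool that follows the current CUDACachingAllocator configuration
+        >>> pool = torch.cuda.MemPool()
+        >>> pool_id = pool.id  # tuple of two ints identifying the pool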
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
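+
+    A minimal sketch pairing an allocation with its matching delete; the size
+    of 1024 bytes is arbitrary and a CUDA device is assumed::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)
+        >>> torch.cuda.caching_allocator_delete(ptr)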
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
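+
+    A sketch of measuring the peak for a single step; ``run_step()`` is a
+    placeholder for the caller's own workload::
+
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> run_step()
+        >>> peak_bytes = torch.cuda.max_memory_reserved()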
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
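+
+    Example (a minimal sketch, assuming a CUDA-capable build with at least one
+    visible device; the tensor size below is arbitrary)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     # allocations made inside the block are served from ``pool``
+        ...     x = torch.empty(1024, device="cuda")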
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
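+
+
+# A small usage sketch of the history-recording / snapshot workflow described
+# above.  It is only a sketch: it assumes a CUDA-capable build with at least
+# one visible device, and the file name "example_snapshot.pickle" is an
+# arbitrary choice.
+if __name__ == "__main__" and torch.cuda.is_available():
+    # Start recording allocator history with Python-only stack traces.
+    _record_memory_history(enabled="all", stacks="python")
+    # Allocate and free some CUDA memory so the history has events to show.
+    buffers = [torch.empty(1 << 20, device="cuda") for _ in range(4)]
+    del buffers
+    # Dump a snapshot that can be inspected at https://pytorch.org/memory_viz.
+    _dump_snapshot("example_snapshot.pickle")
+    # Stop recording to avoid the (small) bookkeeping overhead afterwards.
+    _record_memory_history(enabled=None)
+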
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
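+
+    Example (a minimal sketch, assuming a CUDA device is available; the 1 KiB
+    size below is arbitrary)::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)
+        >>> # ... hand ``ptr`` to another framework for the buffer's lifetime ...
+        >>> torch.cuda.caching_allocator_delete(ptr)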
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
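+
+    A minimal sketch of that per-iteration pattern (illustrative only;
+    ``loader`` and ``train_step`` are placeholders)::
+
+        >>> # xdoctest: +SKIP
+        >>> for batch in loader:
+        ...     torch.cuda.reset_peak_memory_stats()
+        ...     train_step(batch)
+        ...     peak_bytes = torch.cuda.max_memory_reserved()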
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                              # and adds it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache() was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
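+
+    For illustration, a hypothetical flow that loads a pluggable allocator from a
+    shared library (placeholder file and symbol names) and makes it active::
+
+        >>> # xdoctest: +SKIP
+        >>> new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+        ...     "my_alloc.so", "my_alloc", "my_free"
+        ... )
+        >>> torch.cuda.memory.change_current_allocator(new_alloc)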
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
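+
+    A minimal usage sketch (illustrative only)::
+
+        >>> # xdoctest: +SKIP
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     out = torch.randn(8, device="cuda")  # allocation routed to ``pool``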
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
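+
+
+# Illustrative sketch: reading a snapshot saved by _dump_snapshot() and
+# totalling reserved memory, based on the Snapshot/Segment structure described
+# in the _snapshot() docstring above. "workload_snapshot.pickle" is a
+# hypothetical file name.
+#
+#     import pickle
+#
+#     with open("workload_snapshot.pickle", "rb") as f:
+#         snap = pickle.load(f)
+#
+#     total_reserved = sum(seg["total_size"] for seg in snap["segments"])
+#     print(f"{len(snap['segments'])} segments, {total_reserved} bytes reserved")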
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
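+
+
+# Illustrative sketch: loading a custom allocator from a user-built shared
+# library and making it the active one. "alloc.so", "my_malloc" and "my_free"
+# are hypothetical names; the library must export functions matching the
+# signatures documented in CUDAPluggableAllocator above, and the swap must
+# happen before the default allocator is first used.
+#
+#     import torch
+#
+#     new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+#         "alloc.so", "my_malloc", "my_free"
+#     )
+#     torch.cuda.memory.change_current_allocator(new_alloc)
+#     x = torch.empty(64, device="cuda")  # now served by the custom allocator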
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
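+
+
+# Illustrative sketch: routing the allocations of a block of code into a
+# dedicated pool with the context manager above, assuming a PyTorch build that
+# exposes MemPool and use_mem_pool from torch.cuda.memory.
+#
+#     import torch
+#
+#     pool = torch.cuda.memory.MemPool()
+#     with torch.cuda.memory.use_mem_pool(pool):
+#         x = torch.empty(1024, device="cuda")  # allocated from `pool`
+#     print(pool.id, pool.use_count())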
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream; this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream. If it is ``None`` then
+            the default stream for the selected device is used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
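+
+
+# Illustrative sketch: handing raw caching-allocator memory to another
+# framework and releasing it afterwards. The returned pointer is an integer
+# address; how it is consumed by the other framework is hypothetical here.
+#
+#     import torch
+#
+#     ptr = torch.cuda.memory.caching_allocator_alloc(1024)  # 1 KiB on the current device
+#     try:
+#         pass  # hand `ptr` to an external library expecting a device pointer
+#     finally:
+#         torch.cuda.memory.caching_allocator_delete(ptr)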
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the memory that the caching allocator may allocate on a CUDA device.
+    The allowed value equals the total visible memory multiplied by the fraction.
+    Trying to allocate more than the allowed value in a process raises an out-of-memory
+    error in the allocator.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and become visible in
+    `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
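+
+    Illustrative sketch (assumes a CUDA device is present; the tensor shape is an
+    arbitrary way to create some allocator activity)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> import torch
+        >>> x = torch.randn(4, 1024, 1024, device="cuda")
+        >>> print(torch.cuda.memory_summary(abbreviated=True))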
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
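+
+    Illustrative sketch (assumes a CUDA device is present; the GiB conversion is
+    just for readability)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> import torch
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB total")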
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory CUDA still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the ``.so`` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
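+
+    Illustrative sketch (assumes ``MemPool`` and ``use_mem_pool`` are exposed as
+    ``torch.cuda.MemPool`` / ``torch.cuda.use_mem_pool``, as listed in ``__all__``,
+    and that a CUDA device is available)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> import torch
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # routed to the pool
+        >>> pool.use_count()  # the pool is still referenced while x is alive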
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
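+
+
+# Illustrative, optional demo of the private snapshot helpers defined above; it is a
+# sketch rather than required module behavior, and the output filename passed to
+# _dump_snapshot() is an arbitrary choice.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        _record_memory_history(enabled="all", context="all", stacks="python")
+        x = torch.empty(1024, 1024, device="cuda")  # generate some allocator traffic
+        del x
+        torch.cuda.empty_cache()
+        _dump_snapshot("example_snapshot.pickle")  # open at pytorch.org/memory_viz
+        _record_memory_history(enabled=None)  # stop recording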
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
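+
+    Example (a minimal sketch; the file name and ``max_entries`` value are
+    illustrative, and a CUDA-enabled build of PyTorch is assumed)::
+
+        torch.cuda.memory._record_memory_history(max_entries=100000)
+        # ... run the workload whose allocations you want to trace ...
+        torch.cuda.memory._dump_snapshot("snapshot.pickle")
+        # stop recording once the snapshot has been written
+        torch.cuda.memory._record_memory_history(enabled=None)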
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the ``.so`` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
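+
+        Example (a sketch, not a drop-in recipe: ``alloc.so``, ``my_malloc`` and
+        ``my_free`` are assumed names for a user-compiled allocator library)::
+
+            new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+                "alloc.so", "my_malloc", "my_free"
+            )
+            torch.cuda.memory.change_current_allocator(new_alloc)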
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
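+
+    Example (a minimal sketch; assumes a CUDA device is available and that
+    :class:`~torch.cuda.MemPool` and this context manager are re-exported
+    under ``torch.cuda``)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # allocations made inside the context are routed to ``pool``
+            x = torch.empty(1024, device="cuda")
+        # allocations made outside the context use the default pool again
+        y = torch.empty(1024, device="cuda")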
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
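+
+
+# Minimal usage sketch of the statistics APIs above. Assumes a CUDA build of
+# PyTorch and at least one visible GPU; the tensor size is arbitrary and
+# purely illustrative.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        x = torch.empty(4 * 1024 * 1024, device="cuda")  # ~16 MiB of float32
+        free, total = mem_get_info()  # global free/total bytes via cudaMemGetInfo
+        print(f"free={free} total={total}")
+        print(f"allocated={memory_allocated()} reserved={memory_reserved()}")
+        print(memory_summary(abbreviated=True))
+        del x
+        empty_cache()  # return unoccupied cached blocks to the driver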
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
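+
+    Example (illustrative; assumes at least one visible CUDA device)::
+
+        >>> # xdoctest: +SKIP("requires a CUDA-capable device")
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> free_bytes <= total_bytes
+        True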
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
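+
+
+# Illustrative sketch (not part of the upstream API surface): one way the
+# private history-recording helpers above might be combined with the snapshot
+# helpers defined below. The function is never called by this module, the file
+# name is an arbitrary example, and a CUDA-capable device is assumed.
+def _example_record_history_and_dump(path: str = "memory_trace_example.pickle") -> None:
+    _record_memory_history("all", context="all", stacks="python")
+    _ = torch.empty(1024, device="cuda")  # any CUDA allocation is now traced
+    _dump_snapshot(path)  # write a snapshot that pytorch.org/memory_viz can load
+    _record_memory_history(None)  # stop recording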
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocated block received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments, or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
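+
+
+# Illustrative sketch (not part of the upstream API surface): exporting both
+# visualizations produced by the private helpers above from a single snapshot.
+# The function is never called by this module; file names are arbitrary examples.
+def _example_export_memory_reports(prefix: str = "cuda_mem_example") -> None:
+    snap = _snapshot()  # take one snapshot and render it two ways
+    _save_segment_usage(f"{prefix}_segments.svg", snap)
+    _save_memory_usage(f"{prefix}_memory.svg", snap)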
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
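+
+
+# Illustrative sketch (editorial addition): routing a group of allocations into a
+# dedicated pool and inspecting that pool afterwards. The helper name is a placeholder.
+def _example_allocate_in_pool() -> None:
+    pool = MemPool()  # backed by the current CUDACachingAllocator configuration
+    with use_mem_pool(pool):
+        scratch = torch.empty(1024, 1024, device="cuda")  # served from `pool`
+    # Allocations made outside the context manager go back to the default pool.
+    print(f"pool id={pool.id}, use_count={pool.use_count()}")
+    del scratch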
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
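+
+    Example (an illustrative sketch; it assumes a CUDA device is available and
+    uses an arbitrary placeholder workload)::
+
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> x = torch.empty(1024, device="cuda")
+        >>> peak_bytes = torch.cuda.max_memory_reserved()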
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',           # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',   # the caching allocator asks cudaMalloc for more memory
+                                   # and adds it as a segment in its cache
+                'segment_free',    # the caching allocator called cudaFree to return memory
+                                   # to CUDA, possibly trying to free up memory to
+                                   # allocate more segments or because empty_cache was called
+                'oom',             # the allocator threw an OOM exception. 'size' is
+                                   # the requested number of bytes that did not succeed
+                'snapshot',        # the allocator generated a memory snapshot
+                                   # useful to correlate a previously taken
+                                   # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory CUDA still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
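+
+    Example (an illustrative sketch; the pool, tensor shape, and variable names
+    are placeholders)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # allocation routed to the pool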
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # a free was requested for the allocated memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
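+# Illustrative sketch, not part of the upstream module: how the private
+# `_set_allocator_settings` helper above might be exercised.  The knob values
+# are examples of options normally passed through the ``PYTORCH_CUDA_ALLOC_CONF``
+# environment variable; `_example_tune_allocator` is added for documentation only.
+def _example_tune_allocator() -> None:
+    # Do not split cached blocks larger than 128 MiB and start reclaiming
+    # memory once roughly 80% of the device is in use.
+    _set_allocator_settings("max_split_size_mb:128,garbage_collection_threshold:0.8")
+    # Reports the active backend, e.g. "native" or "cudaMallocAsync".
+    print(get_allocator_backend())
+
+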
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the ``.so`` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently only supported on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
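+# Illustrative sketch, not part of the upstream module: installing a pluggable
+# allocator with the classes and functions above.  The shared-library path and
+# the exported symbol names ("my_malloc", "my_free") are hypothetical
+# placeholders; the swap has to happen before any CUDA memory is allocated.
+def _example_install_pluggable_allocator() -> None:
+    custom = CUDAPluggableAllocator("./my_allocator.so", "my_malloc", "my_free")
+    change_current_allocator(custom)  # raises if the default allocator was already used
+    assert isinstance(_get_current_allocator(), _CUDAAllocator)
+
+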
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
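+
+
+# Illustrative sketch, not part of the upstream module: a minimal use of the
+# memory-pool helpers defined above.  The tensor size is an arbitrary
+# placeholder and `_example_use_mem_pool` is added for documentation only.
+def _example_use_mem_pool() -> None:
+    pool = MemPool()  # pool backed by the current CUDACachingAllocator configuration
+    with use_mem_pool(pool):
+        # Allocations made in this block are routed to `pool`.
+        scratch = torch.empty(1024, device="cuda")
+        assert MemPoolContext.active_pool() is not None
+    del scratch
+    print(pool.id, pool.use_count())  # the pool's ID and reference count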
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
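As the max_memory_allocated and max_memory_reserved docstrings above suggest, the peak counters pair with reset_peak_memory_stats to profile individual iterations. A sketch for illustration only, not part of the patched file; `model` and `batches` are hypothetical placeholders supplied by the caller:

    import torch

    def peak_memory_per_step(model, batches, device="cuda:0"):
        # Illustrative helper: prints peak tensor and reserved memory per step.
        model = model.to(device)
        for step, batch in enumerate(batches):
            torch.cuda.reset_peak_memory_stats(device)
            loss = model(batch.to(device)).sum()
            loss.backward()
            peak = torch.cuda.max_memory_allocated(device)
            reserved = torch.cuda.memory_stats(device=device).get("reserved_bytes.all.peak", 0)
            print(f"step {step}: peak allocated {peak / 2**20:.1f} MiB, "
                  f"peak reserved {reserved / 2**20:.1f} MiB")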
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
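The mem_get_info, memory_summary, and list_gpu_processes functions documented above combine well when diagnosing an out-of-memory condition. A sketch for illustration only, not part of the patched file; the helper name is made up:

    import torch

    def report_gpu_state(device=0):
        # Global free/total bytes as reported by cudaMemGetInfo.
        free, total = torch.cuda.mem_get_info(device)
        print(f"free: {free / 2**30:.2f} GiB of {total / 2**30:.2f} GiB")
        # Allocator-level counters, then per-process usage for the same device.
        print(torch.cuda.memory_summary(device=device, abbreviated=True))
        print(torch.cuda.list_gpu_processes(device))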
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc'  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
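The _record_memory_history, _snapshot, and _dump_snapshot trio described above is the usual capture path for pytorch.org/memory_viz. A sketch for illustration only, not part of the patched file; these are underscore-prefixed APIs that may change between releases, and the output file name is arbitrary:

    import torch

    # Start recording allocation stack traces and alloc/free events.
    torch.cuda.memory._record_memory_history(max_entries=100_000)

    # ... run the workload being investigated (placeholder work below) ...
    x = torch.randn(1024, 1024, device="cuda")
    y = x @ x

    # Write a pickle that the interactive viewer at pytorch.org/memory_viz can load.
    torch.cuda.memory._dump_snapshot("memory_trace.pickle")

    # Stop recording when done.
    torch.cuda.memory._record_memory_history(enabled=None)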
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
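To close out the module, a short sketch of routing allocations through a private pool with MemPool and use_mem_pool, for illustration only and not part of the patched file; MemPool is a recent, still-evolving API, so the exact semantics may differ in other releases:

    import torch
    from torch.cuda.memory import MemPool, use_mem_pool

    # allocator=None: the pool follows the CUDACachingAllocator's current configuration.
    pool = MemPool()

    with use_mem_pool(pool):
        # Allocations made on the current device inside this block route to `pool`.
        scratch = torch.empty(4 * 1024 * 1024, device="cuda")

    print("pool id:", pool.id, "use count:", pool.use_count())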
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes the amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                              # and adds it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
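+
+
+# ---------------------------------------------------------------------------
+# Usage sketch (illustrative only, not part of the upstream allocator module).
+# It shows how the introspection and pooling helpers defined above fit
+# together; the tensor shapes, the snapshot filename and the max_entries
+# value are arbitrary choices for this example. Nothing runs on import.
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        # Record alloc/free history so the snapshot below carries stack traces.
+        _record_memory_history(max_entries=100_000)
+
+        # Allocate a tensor and read back the allocator counters (in bytes).
+        x = torch.randn(1024, 1024, device="cuda")
+        print("allocated:", memory_allocated())
+        print("reserved: ", memory_reserved())
+
+        # Route allocations made inside the context to a private memory pool.
+        pool = MemPool()
+        with use_mem_pool(pool):
+            y = torch.randn(512, 512, device="cuda")  # served from `pool`
+
+        # Human-readable report plus global free/total device memory.
+        print(memory_summary(abbreviated=True))
+        print("free/total:", mem_get_info())
+
+        # Persist a snapshot viewable at pytorch.org/memory_viz, then stop recording.
+        _dump_snapshot("memory_snapshot.pickle")
+        _record_memory_history(None)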
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream, this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream.
+            If ``None``, the default stream for the selected device is used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the amount of memory the caching allocator may
+    allocate on a CUDA device. The allowed value equals the total visible memory
+    multiplied by the fraction. If a process tries to allocate more than the
+    allowed value, the allocator raises an out-of-memory error.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and is visible in
+    `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
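+
+    Example (a minimal sketch; assumes a CUDA-capable build with an already
+    initialized CUDA context; the tensor size is an arbitrary placeholder)::
+
+        >>> buf = torch.empty(1024, 1024, device="cuda")
+        >>> del buf
+        >>> torch.cuda.empty_cache()  # return the freed, cached block to the driver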
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
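+
+    Example (a minimal sketch of typical use; the printed table's contents
+    depend entirely on the device and on the allocations made so far)::
+
+        >>> x = torch.randn(1024, 1024, device="cuda")
+        >>> print(torch.cuda.memory_summary(abbreviated=True))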
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
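+
+    Example (illustrative; the reported values are device dependent)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> free_bytes <= total_bytes
+        True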
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
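+
+    Example (a minimal sketch; no custom allocator is supplied, so allocations
+    follow the default CUDACachingAllocator configuration)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> isinstance(pool.id, tuple) and isinstance(pool.use_count(), int)
+        True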
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
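+
+    Example (a minimal sketch; the returned value is a raw pointer and must be
+    released with :func:`~torch.cuda.caching_allocator_delete`)::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)
+        >>> torch.cuda.caching_allocator_delete(ptr)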
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
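+
+    Example (a minimal sketch of per-iteration peak tracking; ``loader`` and
+    ``train_step`` are placeholders for user code)::
+
+        >>> for batch in loader:
+        ...     torch.cuda.reset_peak_memory_stats()
+        ...     train_step(batch)
+        ...     peak_bytes = torch.cuda.max_memory_reserved()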
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
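+
+
+# ---------------------------------------------------------------------------
+# Editorial usage sketch (not part of the upstream torch.cuda.memory source).
+# A minimal, hedged example of the introspection APIs documented above. It
+# assumes this module mirrors ``torch.cuda.memory`` and that a CUDA device is
+# available; the tensor sizes and the "example_snapshot.pickle" filename are
+# arbitrary illustration choices.
+if __name__ == "__main__" and torch.cuda.is_available():
+    # Start recording alloc/free events (with stack traces) before the workload.
+    _record_memory_history(enabled="all")
+    reset_peak_memory_stats()
+
+    x = torch.randn(1024, 1024, device="cuda")
+    y = x @ x
+
+    # Coarse counters: bytes occupied by tensors vs. bytes reserved by the
+    # caching allocator (reserved is at least as large as allocated).
+    print("allocated bytes:", memory_allocated())
+    print("reserved bytes: ", memory_reserved())
+    print("peak allocated :", max_memory_allocated())
+
+    # Fine-grained counters; keys follow the "<stat>.<pool>.<metric>" scheme
+    # described in the memory_stats() docstring.
+    stats = memory_stats()
+    print("cudaMalloc retries:", stats.get("num_alloc_retries", 0))
+
+    # Route allocations made in this block through a private pool, then dump a
+    # snapshot that the pytorch.org/memory_viz viewer can open.
+    pool = MemPool()
+    with use_mem_pool(pool):
+        z = torch.empty(256, 256, device="cuda")
+    _dump_snapshot("example_snapshot.pickle")
+    _record_memory_history(enabled=None)  # stop recording history
+# ---------------------------------------------------------------------------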
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from .
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
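+
+    A minimal sketch of the per-iteration measurement described above; ``model``,
+    ``batches`` and ``train_step`` are hypothetical stand-ins for user code:
+
+    .. code-block:: python
+
+        for i, batch in enumerate(batches):
+            torch.cuda.reset_peak_memory_stats()  # restart peak tracking
+            train_step(model, batch)              # user-defined GPU work
+            peak = torch.cuda.max_memory_reserved()
+            print(f"iter {i}: peak reserved {peak / 2**20:.1f} MiB")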
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
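+
+    A typical usage sketch with a pluggable allocator; ``alloc.so`` and the
+    function names are placeholders for a user-provided library:
+
+    .. code-block:: python
+
+        new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+            "alloc.so", "my_malloc", "my_free"
+        )
+        torch.cuda.memory.change_current_allocator(new_alloc)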
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
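+
+    A minimal usage sketch (tensor shapes are illustrative only):
+
+    .. code-block:: python
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # allocations made inside the context are routed to ``pool``
+            x = torch.empty(1024, device="cuda")
+        # allocations made outside the context use the default pool again
+        y = torch.empty(1024, device="cuda")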
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
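+
+    A small sketch of when a call can help (tensor sizes are arbitrary):
+
+    .. code-block:: python
+
+        x = torch.empty(1024, 1024, device="cuda")
+        del x                                # memory returns to the caching allocator
+        print(torch.cuda.memory_reserved())  # still reserved by the allocator
+        torch.cuda.empty_cache()             # hand unused cached blocks back to CUDA
+        print(torch.cuda.memory_reserved())  # typically lower now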
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
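+
+    A minimal usage sketch for out-of-memory handling; ``run_step`` is a
+    hypothetical user function:
+
+    .. code-block:: python
+
+        try:
+            run_step()
+        except torch.cuda.OutOfMemoryError:
+            print(torch.cuda.memory_summary(abbreviated=True))
+            raise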
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
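+
+    A small sketch of reading the returned ``(free, total)`` tuple:
+
+    .. code-block:: python
+
+        free, total = torch.cuda.mem_get_info()
+        print(f"{free / 2**30:.2f} GiB free of {total / 2**30:.2f} GiB")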
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations.
+ # If the reuse is smaller than the segment, the segment
+ # is split into more than one Block.
+ # empty_cache() frees Segments that are entirely inactive.
+ address: int
+ total_size: int # cudaMalloc'd size of segment
+ stream: int
+ segment_type: Literal['small', 'large'] # 'large' (>1MB)
+ allocated_size: int # size of memory in use
+ active_size: int # size of memory in use or in active_awaiting_free state
+ blocks : List[Block]
+
+ class Block(TypedDict):
+ # A piece of memory returned from the allocator, or
+ # currently cached but inactive.
+ size: int
+ requested_size: int # size requested during malloc, may be smaller than
+ # size due to rounding
+ address: int
+ state: Literal['active_allocated', # used by a tensor
+ 'active_awaiting_free', # waiting for another stream to finish using
+ # this, then it will become free
+ 'inactive',] # free for reuse
+ frames: List[Frame] # stack trace from where the allocation occurred
+
+ class Frame(TypedDict):
+ filename: str
+ line: int
+ name: str
+
+ class TraceEntry(TypedDict):
+ # When `torch.cuda.memory._record_memory_history()` is enabled,
+ # the snapshot will contain TraceEntry objects that record each
+ # action the allocator took.
+ action: Literal[
+ 'alloc', # memory allocated
+ 'free_requested', # the allocation received a call to free memory
+ 'free_completed', # the memory that was requested to be freed is now
+ # able to be used in future allocation calls
+ 'segment_alloc', # the caching allocator asks cudaMalloc for more memory
+ # and added it as a segment in its cache
+ 'segment_free', # the caching allocator called cudaFree to return memory
+ # to cuda, possibly trying to free up memory to
+ # allocate more segments or because empty_cache was called
+ 'oom', # the allocator threw an OOM exception. 'size' is
+ # the requested number of bytes that did not succeed
+ 'snapshot' # the allocator generated a memory snapshot
+ # useful to correlate a previously taken
+ # snapshot with this trace
+ ]
+ addr: int # not present for OOM
+ frames: List[Frame]
+ size: int
+ stream: int
+ device_free: int # only present for OOM, the amount of
+ # memory cuda still reports to be free
+
+ Returns:
+ The Snapshot dictionary object
+ """
+ return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+ """
+ Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+ This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+ Args:
+ filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+ """
+ s = _snapshot()
+ with open(filename, "wb") as f:
+ pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+ if snapshot is None:
+ snapshot = _snapshot()
+ with open(filename, "w") as f:
+ f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+ if snapshot is None:
+ snapshot = _snapshot()
+ with open(filename, "w") as f:
+ f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+ return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+ r"""Return a string describing the active allocator backend as set by
+ ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+ ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+ (CUDA's built-in asynchronous allocator).
+
+ .. note::
+ See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+ """
+ return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+ r"""Wrapper over internal CUDA memory allocators."""
+
+ def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+ self._allocator = allocator
+
+ def allocator(self):
+ return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+ r"""CUDA memory allocator loaded from a .so file."""
+
+ def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+ r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+ To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+ Args:
+ path_to_so_file(str): Path in the filesystem to the `.so` file containing
+ the allocator functions
+ alloc_fn_name(str): Name of the function to perform the memory allocation
+ in the .so file. The signature must be:
+ void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+ free_fn_name(str): Name of the function to perform the memory release
+ in the .so file. The signature must be:
+ void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+ .. warning::
+ This is currently supported only on Unix-based OSes
+
+ .. note::
+ See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+ """
+ allocator = ctypes.CDLL(path_to_so_file)
+ alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+ free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+ assert alloc_fn is not None
+ assert free_fn is not None
+ self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+ r"""Change the currently used memory allocator to be the one provided.
+
+ If the current allocator has already been used/initialized, this function will error.
+
+ Args:
+ allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+ .. note::
+ See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+ """
+ torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+ r"""Return the allocator being currently used.
+
+ .. note::
+ See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+ """
+ return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+ r"""MemPoolContext holds the currently active pool and stashes the previous
+ pool. On deletion it makes the previous pool active.
+
+ Args:
+ pool(torch.cuda.MemPool): a MemPool object to be made active so that
+ allocations route to this pool.
+
+ """
+
+ def __init__(self, pool: _MemPool):
+ super().__init__(pool)
+
+ @staticmethod
+ def active_pool() -> Optional[_MemPool]:
+ r"""Returns the active MemPool"""
+ return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+ r"""MemPool represents a pool of memory in a caching allocator. Currently,
+ it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+ Args:
+ allocator(torch._C._cuda_CUDAAllocator, optional): a
+ torch._C._cuda_CUDAAllocator object that can be used to
+ define how memory gets allocated in the pool. If :attr:`allocator`
+ is ``None`` (default), memory allocation follows the default/
+ current configuration of the CUDACachingAllocator.
+
+ """
+
+ def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+ super().__init__(allocator, True)
+
+ @property
+ def id(self) -> tuple[int, int]:
+ r"""Returns the ID of this pool as a tuple of two ints."""
+ return super().id
+
+ @property
+ def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+ r"""Returns the allocator this MemPool routes allocations to."""
+ return super().allocator
+
+ def use_count(self) -> int:
+ r"""Returns the reference count of this pool."""
+ return super().use_count()
+
+ def snapshot(self):
+ r"""Return a snapshot of the CUDA memory allocator pool state across all
+ devices.
+
+ Interpreting the output of this function requires familiarity with the
+ memory allocator internals.
+
+ .. note::
+ See :ref:`cuda-memory-management` for more details about GPU memory
+ management.
+ """
+ try:
+ ctx = MemPoolContext(self)
+ snapshot = torch.cuda.memory_snapshot()
+ finally:
+ del ctx
+ return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+ r"""A context manager that routes allocations to a given pool.
+
+ Args:
+ pool(torch.cuda.MemPool): a MemPool object to be made active so that
+ allocations route to this pool.
+ device (torch.device or int, optional): selected device. Uses MemPool on
+ the current device, given by :func:`~torch.cuda.current_device`,
+ if :attr:`device` is ``None`` (default).
+
+ """
+ ctx = MemPoolContext(pool)
+ device_index = (
+ torch.cuda.current_device() if device is None else _get_device_index(device)
+ )
+ _cuda_beginAllocateToPool(device_index, pool.id)
+ try:
+ yield
+ finally:
+ _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+ _cuda_releasePool(device_index, pool.id)
+ del ctx
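For reviewers of this patch, a short usage sketch may help tie the MemPool / use_mem_pool additions above together. This is illustrative only: it assumes a CUDA-capable PyTorch build in which these helpers are exported under torch.cuda (as the __all__ list in this file intends); the tensor shape and the printed fields are arbitrary.

    import torch

    # Route allocations made inside the context to a private pool, then
    # inspect the pool and the allocator's bookkeeping.
    if torch.cuda.is_available():
        pool = torch.cuda.MemPool()  # pool backed by the default allocator
        with torch.cuda.use_mem_pool(pool):
            # Tensors created inside the context are served from `pool`.
            scratch = torch.empty(1024, 1024, device="cuda")
        print("pool id:", pool.id)
        print("pool use_count:", pool.use_count())
        print("bytes currently allocated:", torch.cuda.memory_allocated())
        del scratch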
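Similarly, the _record_memory_history and _dump_snapshot helpers defined earlier in this file are easiest to evaluate with a small end-to-end sketch. These are underscore-prefixed internal APIs; the workload and the output filename below are placeholders, and a CUDA device is assumed.

    import torch

    if torch.cuda.is_available():
        # Start recording alloc/free events together with stack traces.
        torch.cuda.memory._record_memory_history(enabled="all")
        x = torch.randn(4096, 4096, device="cuda")
        y = x @ x
        del x, y
        # Dump a snapshot that the pytorch.org/memory_viz viewer can open.
        torch.cuda.memory._dump_snapshot("allocation_trace.pickle")
        # Stop recording.
        torch.cuda.memory._record_memory_history(enabled=None)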
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
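+
+    Example (a minimal sketch, assuming a CUDA device is present; the exact
+    numbers in the printout will vary)::
+
+        >>> x = torch.randn(1024, 1024, device="cuda")
+        >>> print(torch.cuda.memory_summary(abbreviated=True))  # doctest: +SKIP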
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
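+
+    Example (a minimal sketch, assuming a CUDA device is present)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes / 2**30:.1f} GiB free of {total_bytes / 2**30:.1f} GiB")  # doctest: +SKIP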
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot,
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory CUDA still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
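+
+    Example (a minimal sketch; it assumes ``MemPool`` and ``use_mem_pool`` from
+    this module are re-exported as ``torch.cuda.MemPool`` and
+    ``torch.cuda.use_mem_pool``)::
+
+        >>> pool = torch.cuda.MemPool()  # pool backed by the current allocator configuration
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # this allocation is routed to `pool`
+        >>> pool.use_count()  # doctest: +SKIP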
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
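+
+    Example (a minimal sketch, assuming a CUDA device is present)::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)  # 1 KiB on the current device and stream
+        >>> torch.cuda.caching_allocator_delete(ptr)  # the allocator tracks the device/stream for us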
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
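+
+    Example (a minimal sketch, assuming a CUDA device is present)::
+
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> x = torch.randn(4096, 4096, device="cuda")  # roughly a 64 MiB allocation
+        >>> torch.cuda.max_memory_reserved() // 2**20  # peak reserved memory since the reset, in MiB  # doctest: +SKIP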
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+    device = _get_amdsmi_device_index(device)
+
+    try:
+        handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+        procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+    except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+        return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
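+
+# Illustrative sketch (comments only): reading the flattened statistics
+# returned by `memory_stats()` above. Key names follow the documentation;
+# the printed values depend entirely on the workload.
+#
+#   stats = torch.cuda.memory_stats()
+#   print(stats["allocated_bytes.all.current"])   # bytes currently allocated by tensors
+#   print(stats["reserved_bytes.all.peak"])       # peak bytes reserved by the allocator
+#   torch.cuda.reset_peak_memory_stats()          # restart peak tracking from here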
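+
+# Illustrative sketch (comments only): recording allocation history and dumping
+# a snapshot for the pytorch.org/memory_viz viewer, using the private helpers
+# defined above. `run_workload()` and the filename are placeholders.
+#
+#   torch.cuda.memory._record_memory_history(max_entries=100000)
+#   run_workload()                                             # user code under test
+#   torch.cuda.memory._dump_snapshot("example_snapshot.pickle")
+#   torch.cuda.memory._record_memory_history(enabled=None)     # stop recording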
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream, this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream.
+            If it is ``None``, the default stream for the selected device is used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction limits the caching allocator to a portion of the memory on a CUDA device.
+    The allowed amount equals the total visible memory multiplied by the fraction.
+    Trying to allocate more than the allowed amount in a process raises an
+    out-of-memory error in the allocator.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and is visible in
+    `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
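+
+    Example::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> # Minimal illustrative sketch: drop a large tensor, then release the
+        >>> # cached blocks so other processes can see the memory in `nvidia-smi`.
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> del x
+        >>> torch.cuda.empty_cache()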
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and adds it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
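+
+
+# ---------------------------------------------------------------------------
+# Editor's note: the helpers below are illustrative sketches added for
+# documentation purposes only; they are not part of the upstream module.
+# Helper names (_example_*) and the toy workloads are placeholders.
+# ---------------------------------------------------------------------------
+
+
+def _example_track_peak_memory_per_step():
+    """Sketch: measure peak allocated/reserved bytes for one unit of work.
+
+    Mirrors the pattern described in the ``max_memory_allocated`` docstring:
+    reset the peak counters, run the workload, then read the peaks.
+    """
+    if not torch.cuda.is_available():
+        return
+    reset_peak_memory_stats()
+    # Placeholder workload: any CUDA work would do here.
+    x = torch.randn(1024, 1024, device="cuda")
+    y = x @ x
+    torch.cuda.synchronize()
+    print(f"peak allocated: {max_memory_allocated()} B")
+    print(f"peak reserved:  {max_memory_reserved()} B")
+    del x, y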
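+
+
+def _example_report_memory_state():
+    """Sketch (editor addition): print human-readable allocator and process info.
+
+    Intended for use when handling out-of-memory errors, as suggested by the
+    ``memory_summary`` and ``list_gpu_processes`` docstrings above.
+    """
+    if not torch.cuda.is_available():
+        return
+    free_bytes, total_bytes = mem_get_info()
+    print(f"cudaMemGetInfo: {free_bytes} B free of {total_bytes} B total")
+    print(memory_summary(abbreviated=True))
+    print(list_gpu_processes())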
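+
+
+def _example_capture_memory_history(out_file="snapshot.pickle"):
+    """Sketch (editor addition): record alloc/free history and dump a snapshot.
+
+    Follows the workflow described in ``_record_memory_history`` and
+    ``_dump_snapshot``; the output filename and the toy workload are
+    placeholders. The resulting file can be loaded at pytorch.org/memory_viz.
+    """
+    if not torch.cuda.is_available():
+        return
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    try:
+        # Placeholder workload whose allocations we want traced.
+        bufs = [torch.empty(256, 256, device="cuda") for _ in range(8)]
+        del bufs
+    finally:
+        _dump_snapshot(out_file)
+        _record_memory_history(enabled=None)  # stop recording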
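+
+
+def _example_inspect_host_memory_stats():
+    """Sketch (editor addition): read pinned-host-memory counters.
+
+    Uses ``host_memory_stats`` with the flattened key names documented above;
+    the pinned tensor stands in for any workload using page-locked memory.
+    """
+    if not torch.cuda.is_available():
+        return
+    pinned = torch.empty(1024, 1024, pin_memory=True)
+    stats = host_memory_stats()
+    print("num_host_alloc:", stats.get("num_host_alloc", 0))
+    print("host bytes currently allocated:", stats.get("allocated_bytes.current", 0))
+    reset_peak_host_memory_stats()
+    del pinned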
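+
+
+def _example_route_allocations_to_pool():
+    """Sketch (editor addition): route allocations into a dedicated ``MemPool``.
+
+    Builds on ``MemPool`` and ``use_mem_pool`` as defined above; the pool uses
+    the default allocator configuration (``allocator=None``).
+    """
+    if not torch.cuda.is_available():
+        return
+    _lazy_init()
+    pool = MemPool()
+    with use_mem_pool(pool):
+        # Allocations made inside this block are served from `pool`.
+        scratch = torch.empty(1024, device="cuda")
+        del scratch
+    print("pool id:", pool.id, "use_count:", pool.use_count())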
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
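+
+    Example (a minimal sketch, assuming a CUDA-capable device and a build that
+    exposes :class:`torch.cuda.MemPool`; the tensor sizes are illustrative)::
+
+        import torch
+
+        pool = torch.cuda.MemPool()  # pool backed by the current allocator config
+        with torch.cuda.use_mem_pool(pool):
+            # allocations made inside the context are routed to ``pool``
+            x = torch.empty(1024, device="cuda")
+        # allocations made outside the context use the regular allocator again
+        y = torch.empty(1024, device="cuda")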
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
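+
+    Example (a minimal sketch, assuming a CUDA-capable device; the tensor size
+    is illustrative)::
+
+        import torch
+
+        x = torch.empty(1024, 1024, device="cuda")
+        del x                     # the freed block stays cached by the allocator
+        torch.cuda.empty_cache()  # return unoccupied cached blocks to the driver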
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
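+
+    Example (a minimal sketch, assuming a CUDA-capable device; the tensor size
+    is illustrative)::
+
+        import torch
+
+        x = torch.randn(4096, 4096, device="cuda")
+        # e.g. log periodically during training, or when handling an OOM error
+        print(torch.cuda.memory_summary(abbreviated=True))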
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
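+
+    Example (a minimal sketch, assuming a CUDA-capable device)::
+
+        import torch
+
+        free, total = torch.cuda.mem_get_info()  # bytes, device-wide (all processes)
+        print(f"{free / 1024**3:.2f} GiB free of {total / 1024**3:.2f} GiB")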
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
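+
+    Example (a minimal sketch, assuming a CUDA-capable device; the size is
+    illustrative)::
+
+        import torch
+
+        ptr = torch.cuda.caching_allocator_alloc(1024)  # raw device pointer to 1 KiB
+        # ... hand ``ptr`` to another framework that expects a device address ...
+        torch.cuda.caching_allocator_delete(ptr)        # free through the same allocator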
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc'  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
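+
+
+# Usage sketch (illustrative only, not part of the module's API surface): shows how
+# the helpers above are typically combined to debug allocator behaviour. It assumes
+# a CUDA-capable device is present; the tensor shapes and the "memory_debug.pickle"
+# filename are arbitrary placeholders.
+if __name__ == "__main__" and torch.cuda.is_available():
+    # Start recording alloc/free events with Python stack traces.
+    _record_memory_history(enabled="all", context="all", stacks="python")
+
+    x = torch.randn(1024, 1024, device="cuda")
+    y = x @ x  # a few allocations to populate the allocator state
+
+    # Human-readable allocator statistics and raw device-level numbers.
+    print(memory_summary())
+    free_bytes, total_bytes = mem_get_info()
+    print(f"free: {free_bytes / 2**20:.1f} MiB / total: {total_bytes / 2**20:.1f} MiB")
+
+    # Persist a snapshot that can be inspected at pytorch.org/memory_viz,
+    # then stop recording history.
+    _dump_snapshot("memory_debug.pickle")
+    _record_memory_history(enabled=None)
+
+    # Route the allocations of a code region into a dedicated pool.
+    pool = MemPool()
+    with use_mem_pool(pool):
+        scratch = torch.empty(256, 256, device="cuda")
+    del scratch, x, y
+    empty_cache()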
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
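+
+    Example (a minimal illustrative sketch; the printout depends on the device
+    and on prior allocations)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> _ = torch.randn(1024, 1024, device="cuda")
+        >>> print(torch.cuda.memory_summary(abbreviated=True))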
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
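+
+    Example (a minimal illustrative sketch; the exact numbers depend on the
+    device and on other processes using it)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> free_bytes <= total_bytes
+        True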
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly to free up memory for more
+                                 # segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory CUDA still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will raise an error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
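+
+    Example (a minimal illustrative sketch; assumes a CUDA-capable build)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.ones(4, device="cuda")  # this allocation is routed to ``pool``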
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc'  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
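+
+    A minimal usage sketch (illustrative; ``train_step`` is a placeholder for
+    your own workload, not part of this module)::
+
+        >>> # xdoctest: +SKIP
+        >>> try:
+        ...     train_step()
+        ... except torch.cuda.OutOfMemoryError:
+        ...     print(torch.cuda.memory_summary(abbreviated=True))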
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
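+
+    A minimal usage sketch (illustrative; assumes a CUDA device is available)::
+
+        >>> # xdoctest: +SKIP
+        >>> free, total = torch.cuda.mem_get_info()
+        >>> used_fraction = 1.0 - free / total  # fraction of device memory currently in use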
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
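+    A minimal usage sketch (illustrative; the returned string depends on how
+    ``PYTORCH_CUDA_ALLOC_CONF`` is configured)::
+
+        >>> # xdoctest: +SKIP
+        >>> torch.cuda.get_allocator_backend()
+        'native'
+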
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
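+
+    A minimal usage sketch (illustrative; assumes these helpers are re-exported
+    under ``torch.cuda`` like the rest of this module)::
+
+        >>> # xdoctest: +SKIP
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # allocation is routed to ``pool``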
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
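+
+
+# Illustrative end-to-end sketch (kept as a comment so nothing runs on import):
+# measuring the peak memory of a single training iteration with the helpers
+# defined above. ``train_step`` and ``batch`` are placeholders, not part of
+# this module.
+#
+#     torch.cuda.reset_peak_memory_stats()
+#     train_step(batch)
+#     peak_bytes = torch.cuda.max_memory_allocated()
+#
+# Allocation history can likewise be captured and dumped for the viewer at
+# pytorch.org/memory_viz:
+#
+#     torch.cuda.memory._record_memory_history()
+#     train_step(batch)
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")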
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
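+
+    A minimal usage sketch, assuming a CUDA build of PyTorch with at least one
+    visible device; the pool below is backed by the default allocator since no
+    custom allocator is supplied:
+
+    .. code-block:: python
+
+        import torch
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # allocations made inside the context are routed to ``pool``
+            x = torch.empty(1024, device="cuda")
+        # subsequent allocations go back to the default pool
+        y = torch.empty(1024, device="cuda")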
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
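+
+    A minimal sketch, assuming a CUDA device is available:
+
+    .. code-block:: python
+
+        import torch
+
+        x = torch.randn(1024, 1024, device="cuda")
+        del x  # the freed block stays cached inside the allocator
+        print(torch.cuda.memory_reserved())   # still non-zero
+        torch.cuda.empty_cache()
+        print(torch.cuda.memory_reserved())   # unoccupied cached segments released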
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
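+
+    A minimal sketch, assuming a CUDA device is available:
+
+    .. code-block:: python
+
+        import torch
+
+        x = torch.randn(4096, 4096, device="cuda")
+        print(torch.cuda.memory_summary(abbreviated=True))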
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
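+
+    A minimal sketch, assuming a CUDA device is available:
+
+    .. code-block:: python
+
+        import torch
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"free: {free_bytes / 2**30:.2f} GiB of {total_bytes / 2**30:.2f} GiB")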
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
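+
+    A minimal sketch, assuming a CUDA build of PyTorch; no custom allocator is
+    passed, so the pool follows the current CUDACachingAllocator configuration:
+
+    .. code-block:: python
+
+        import torch
+
+        pool = torch.cuda.MemPool()
+        print(pool.id)           # (int, int) identifier of the pool
+        print(pool.use_count())  # reference count held on the pool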
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
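+
+    A minimal sketch, assuming a CUDA device is available; the 1 KiB size and
+    the placeholder body are arbitrary:
+
+    .. code-block:: python
+
+        import torch
+
+        ptr = torch.cuda.caching_allocator_alloc(1024)  # raw device pointer
+        try:
+            ...  # hand ``ptr`` to another framework that consumes raw pointers
+        finally:
+            torch.cuda.caching_allocator_delete(ptr)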
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
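+
+    A minimal usage sketch (assuming a CUDA device is available; the tensor
+    shape below is only illustrative):
+
+    Example::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> x = torch.randn(1024, 1024, device="cuda")
+        >>> peak_bytes = torch.cuda.max_memory_reserved()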
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocated block received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
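+
+    A minimal usage sketch (assuming a CUDA build of PyTorch with at least one
+    visible device; the allocation size is only illustrative):
+
+    Example::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     buf = torch.empty(1024, device="cuda")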
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
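+
+
+# Illustrative sketch only: it assumes a CUDA-capable build with at least one
+# visible device, and the helper name `_memory_debugging_example` is an example
+# name that is never called elsewhere in this module.
+def _memory_debugging_example(snapshot_path: str = "dump_snapshot.pickle") -> str:
+    # Record allocation history, allocate a scratch tensor, dump a snapshot that
+    # can be loaded in the pytorch.org/memory_viz viewer, then return a summary.
+    _record_memory_history("all")
+    scratch = torch.randn(1024, 1024, device="cuda")
+    _dump_snapshot(snapshot_path)
+    del scratch
+    empty_cache()
+    return memory_summary(abbreviated=True)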
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
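+
+
+# ---------------------------------------------------------------------------
+# Illustrative usage sketch (not part of the allocator API above). It assumes
+# a CUDA-enabled build; the tensor shape and output file name are arbitrary
+# examples chosen for this demo, and it only exercises helpers defined in
+# this file.
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        # Record alloc/free history so that _snapshot()/_dump_snapshot()
+        # carry stack traces for every allocation made from here on.
+        _record_memory_history("all")
+
+        # Route allocations made inside the context manager to a private pool.
+        pool = MemPool()
+        with use_mem_pool(pool):
+            scratch = torch.empty(1024, 1024, device="cuda")
+            del scratch
+
+        # Report headline numbers: free/total device memory and current usage.
+        free_bytes, total_bytes = mem_get_info()
+        print(f"free={free_bytes / 2**20:.1f} MiB, total={total_bytes / 2**20:.1f} MiB")
+        print(f"allocated={memory_allocated() / 2**20:.1f} MiB, "
+              f"reserved={memory_reserved() / 2**20:.1f} MiB")
+
+        # Persist a pickled snapshot for the viewer at pytorch.org/memory_viz.
+        _dump_snapshot("snapshot_example.pickle")
+
+        # Return cached, unused blocks to the driver.
+        empty_cache()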
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if
+ rounding adds too much overhead:
+
+ - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+ memory requested by client code; compare this with allocated_bytes to check if
+ allocation rounding adds too much overhead.
+
+ Args:
+ device (torch.device or int, optional): selected device. Returns
+ statistics for the current device, given by :func:`~torch.cuda.current_device`,
+ if :attr:`device` is ``None`` (default).
+
+ .. note::
+ See :ref:`cuda-memory-management` for more details about GPU memory
+ management.
+
+ .. note::
+ With :ref:`backend:cudaMallocAsync`, some stats are not
+ meaningful, and are always reported as zero.
+ """
+ result = []
+
+ def _recurse_add_to_result(prefix, obj):
+ if isinstance(obj, dict):
+ if len(prefix) > 0:
+ prefix += "."
+ for k, v in obj.items():
+ _recurse_add_to_result(prefix + k, v)
+ else:
+ result.append((prefix, obj))
+
+ stats = memory_stats_as_nested_dict(device=device)
+ _recurse_add_to_result("", stats)
+ result.sort()
+
+ return collections.OrderedDict(result)
+
+
+def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]:
+ r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary."""
+ if not is_initialized():
+ return {}
+ device = _get_device_index(device, optional=True)
+ return torch._C._cuda_memoryStats(device)
+
+
+def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None:
+ r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator.
+
+ See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to
+ the `"allocated"` and `"freed"` keys in each individual stat dict, as well as
+ `"num_alloc_retries"` and `"num_ooms"`.
+
+ Args:
+ device (torch.device or int, optional): selected device. Returns
+ statistic for the current device, given by :func:`~torch.cuda.current_device`,
+ if :attr:`device` is ``None`` (default).
+
+ .. note::
+ See :ref:`cuda-memory-management` for more details about GPU memory
+ management.
+ """
+ device = _get_device_index(device, optional=True)
+ return torch._C._cuda_resetAccumulatedMemoryStats(device)
+
+
+def reset_peak_memory_stats(device: Union[Device, int] = None) -> None:
+ r"""Reset the "peak" stats tracked by the CUDA memory allocator.
+
+ See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the
+ `"peak"` key in each individual stat dict.
+
+ Args:
+ device (torch.device or int, optional): selected device. Returns
+ statistic for the current device, given by :func:`~torch.cuda.current_device`,
+ if :attr:`device` is ``None`` (default).
+
+ .. note::
+ See :ref:`cuda-memory-management` for more details about GPU memory
+ management.
+ """
+ device = _get_device_index(device, optional=True)
+ return torch._C._cuda_resetPeakMemoryStats(device)
+
+
+def host_memory_stats() -> dict[str, Any]:
+ r"""Return a dictionary of CUDA host (pinned) memory allocator statistics for the running process.
+
+ The return value of this function is a dictionary of statistics, each of
+ which is a non-negative integer.
+
+ Core statistics:
+
+ - ``"allocated.{current,peak,allocated,freed}"``:
+ number of allocation requests received by the memory allocator.
+ - ``"allocated_bytes.{current,peak,allocated,freed}"``:
+ amount of allocated memory.
+ - ``"segment.{current,peak,allocated,freed}"``:
+ number of reserved segments from ``cudaHostAlloc()``.
+ - ``"reserved_bytes.{current,peak,allocated,freed}"``:
+ amount of reserved memory.
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
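+
+ Example (illustrative only; the printed table is omitted here because its
+ contents depend entirely on the workload and device)::
+
+ >>> _ = torch.randn(256, 256, device="cuda")
+ >>> print(torch.cuda.memory_summary(abbreviated=True)) # multi-line table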
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
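+
+ Example (illustrative; the reported values are device dependent)::
+
+ >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+ >>> free_bytes <= total_bytes
+ True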
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations.
+ # If the reuse is smaller than the segment, the segment
+ # is split into more than one Block.
+ # empty_cache() frees Segments that are entirely inactive.
+ address: int
+ total_size: int # cudaMalloc'd size of segment
+ stream: int
+ segment_type: Literal['small', 'large'] # 'large' (>1MB)
+ allocated_size: int # size of memory in use
+ active_size: int # size of memory in use or in active_awaiting_free state
+ blocks : List[Block]
+
+ class Block(TypedDict):
+ # A piece of memory returned from the allocator, or
+ # currently cached but inactive.
+ size: int
+ requested_size: int # size requested during malloc, may be smaller than
+ # size due to rounding
+ address: int
+ state: Literal['active_allocated', # used by a tensor
+ 'active_awaiting_free', # waiting for another stream to finish using
+ # this, then it will become free
+ 'inactive',] # free for reuse
+ frames: List[Frame] # stack trace from where the allocation occurred
+
+ class Frame(TypedDict):
+ filename: str
+ line: int
+ name: str
+
+ class TraceEntry(TypedDict):
+ # When `torch.cuda.memory._record_memory_history()` is enabled,
+ # the snapshot will contain TraceEntry objects that record each
+ # action the allocator took.
+ action: Literal[
+ 'alloc' # memory allocated
+ 'free_requested', # the allocation received a call to free memory
+ 'free_completed', # the memory that was requested to be freed is now
+ # able to be used in future allocation calls
+ 'segment_alloc', # the caching allocator asked cudaMalloc for more memory
+ # and added it as a segment in its cache
+ 'segment_free', # the caching allocator called cudaFree to return memory
+ # to cuda, possibly trying to free up memory to
+ # allocate more segments or because empty_cache was called
+ 'oom', # the allocator threw an OOM exception. 'size' is
+ # the requested number of bytes that did not succeed
+ 'snapshot' # the allocator generated a memory snapshot
+ # useful to correlate a previously taken
+ # snapshot with this trace
+ ]
+ addr: int # not present for OOM
+ frames: List[Frame]
+ size: int
+ stream: int
+ device_free: int # only present for OOM, the amount of
+ # memory cuda still reports to be free
+
+ Returns:
+ The Snapshot dictionary object
+ """
+ return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+ """
+ Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+ This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+ Args:
+ filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+ """
+ s = _snapshot()
+ with open(filename, "wb") as f:
+ pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+ if snapshot is None:
+ snapshot = _snapshot()
+ with open(filename, "w") as f:
+ f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+ if snapshot is None:
+ snapshot = _snapshot()
+ with open(filename, "w") as f:
+ f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+ return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+ r"""Return a string describing the active allocator backend as set by
+ ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+ ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+ (CUDA's built-in asynchronous allocator).
+
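+ Example (illustrative; ``native`` is the default, and the value depends on
+ how ``PYTORCH_CUDA_ALLOC_CONF`` is set in the environment)::
+
+ >>> torch.cuda.get_allocator_backend()
+ 'native'
+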
+ .. note::
+ See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+ """
+ return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+ r"""Wrapper over internal CUDA memory allocators."""
+
+ def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+ self._allocator = allocator
+
+ def allocator(self):
+ return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+ r"""CUDA memory allocator loaded from a .so file."""
+
+ def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+ r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+ To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+ Args:
+ path_to_so_file(str): Path in the filesystem to the `.so` file containing
+ the allocator functions
+ alloc_fn_name(str): Name of the function to perform the memory allocation
+ in the .so file. The signature must be:
+ void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+ free_fn_name(str): Name of the function to perform the memory release
+ in the .so file. The signature must be:
+ void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+ .. warning::
+ This is currently supported only on Unix-based operating systems
+
+ .. note::
+ See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+ """
+ allocator = ctypes.CDLL(path_to_so_file)
+ alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+ free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+ assert alloc_fn is not None
+ assert free_fn is not None
+ self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+ r"""Change the currently used memory allocator to be the one provided.
+
+ If the current allocator has already been used/initialized, this function will error.
+
+ Args:
+ allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+ .. note::
+ See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+ """
+ torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+ r"""Return the allocator being currently used.
+
+ .. note::
+ See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+ """
+ return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+ r"""MemPoolContext holds the currently active pool and stashes the previous
+ pool. On deletion it makes the previous pool active.
+
+ Args:
+ pool(torch.cuda.MemPool): a MemPool object to be made active so that
+ allocations route to this pool.
+
+ """
+
+ def __init__(self, pool: _MemPool):
+ super().__init__(pool)
+
+ @staticmethod
+ def active_pool() -> Optional[_MemPool]:
+ r"""Returns the active MemPool."""
+ return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+ r"""MemPool represents a pool of memory in a caching allocator. Currently,
+ it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+ Args:
+ allocator(torch._C._cuda_CUDAAllocator, optional): a
+ torch._C._cuda_CUDAAllocator object that can be used to
+ define how memory gets allocated in the pool. If :attr:`allocator`
+ is ``None`` (default), memory allocation follows the default/
+ current configuration of the CUDACachingAllocator.
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
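+
+    Example (a minimal sketch of the record/dump workflow described above; assumes
+    a CUDA device is available and uses an arbitrary snapshot filename)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> torch.cuda.memory._record_memory_history()
+        >>> x = torch.randn(1024, 1024, device="cuda")  # allocations are now traced
+        >>> del x
+        >>> torch.cuda.memory._dump_snapshot("snapshot.pickle")
+        >>> # open "snapshot.pickle" in the viewer at pytorch.org/memory_viz
+        >>> torch.cuda.memory._record_memory_history(enabled=None)  # stop recording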
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc'  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and adds it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
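+
+        Example (a hedged sketch; ``alloc.so``, ``my_malloc`` and ``my_free`` are
+        placeholder names for a user-compiled allocator library exposing the two
+        functions described above)::
+
+            >>> # xdoctest: +SKIP("requires a compiled allocator .so")
+            >>> new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+            ...     "alloc.so", "my_malloc", "my_free"
+            ... )
+            >>> torch.cuda.memory.change_current_allocator(new_alloc)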
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
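+
+    Example (a minimal sketch; assumes a CUDA device is available, and uses a pool
+    created with the default allocator configuration)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.memory.use_mem_pool(pool):
+        ...     x = torch.randn(1024, device="cuda")  # routed to the pool above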
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a shared object (.so) file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently only supported on Unix-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
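+
+
+# A minimal usage sketch of the helpers defined above (assumes a CUDA-enabled
+# build with at least one visible GPU; the tensor shape and snapshot filename
+# are arbitrary illustration values). Guarded by ``__main__`` so importing the
+# module stays free of side effects.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        # Record allocation history so the snapshot below carries stack traces.
+        _record_memory_history("all")
+
+        # Route allocations made inside the context manager to a dedicated pool.
+        pool = MemPool()
+        with use_mem_pool(pool):
+            x = torch.empty(1024, 1024, device="cuda")
+
+        # Query allocator statistics and dump a snapshot that can be loaded
+        # in the viewer at pytorch.org/memory_viz.
+        print(f"peak allocated: {max_memory_allocated()} bytes")
+        print(memory_summary(abbreviated=True))
+        _dump_snapshot("dump_snapshot.pickle")
+
+        # Release cached blocks back to the driver once the tensor is gone.
+        del x
+        empty_cache()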
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
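+
+    A minimal usage sketch (assuming a CUDA device is available and these
+    names are re-exported as ``torch.cuda.MemPool`` and
+    ``torch.cuda.use_mem_pool``)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     # allocations made inside the context are routed to `pool`
+        ...     buf = torch.empty(1024, device="cuda")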
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from .
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
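+
+    A minimal usage sketch (assuming a CUDA device is available; the returned
+    raw pointer must be released with
+    :func:`~torch.cuda.caching_allocator_delete`)::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)  # 1 KiB on the current device
+        >>> torch.cuda.caching_allocator_delete(ptr)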
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
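+
+    A minimal sketch of reading one flattened counter (assuming a CUDA device
+    is available; the key layout is described below)::
+
+        >>> stats = torch.cuda.memory_stats()
+        >>> current_allocated = stats.get("allocated_bytes.all.current", 0)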
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
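+
+    A minimal sketch of reading one of these counters (key names combine a
+    stat name with a metric type, e.g. ``"allocated_bytes.current"``)::
+
+        >>> host_stats = torch.cuda.host_memory_stats()
+        >>> pinned_bytes = host_stats.get("allocated_bytes.current", 0)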
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
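+
+    A minimal sketch of the per-iteration measurement described above (the
+    workload below is an arbitrary placeholder allocation, not part of this API)::
+
+        for _ in range(3):
+            torch.cuda.reset_peak_memory_stats()
+            x = torch.randn(1024, 1024, device="cuda")  # placeholder workload
+            print("peak reserved bytes:", torch.cuda.max_memory_reserved())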
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                        'active_awaiting_free', # waiting for another stream to finish using
+                                                # this, then it will become free
+                        'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+                filename: str
+                line: int
+                name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc'  # memory allocated
+            'free_requested', # the allocation received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                              # able to be used in future allocation calls
+            'segment_alloc', # the caching allocator asked cudaMalloc for more memory
+                             # and added it as a segment in its cache
+            'segment_free', # the caching allocator called cudaFree to return memory
+                            # to cuda possibly trying to free up memory to
+                            # allocate more segments or because empty_cache was called
+            'oom',          # the allocator threw an OOM exception. 'size' is
+                            # the requested number of bytes that did not succeed
+            'snapshot' # the allocator generated a memory snapshot
+                       # useful to correlate a previously taken
+                       # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX OSs.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
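+
+    A minimal sketch of routing allocations through a pool, assuming a CUDA device
+    is available (the tensor size is an arbitrary placeholder)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            x = torch.empty(1024, device="cuda")  # served from the pool
+        print(pool.snapshot())  # inspect the segments owned by this pool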
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
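+
+    A minimal usage sketch (assuming a build where ``MemPool`` is exposed as
+    ``torch.cuda.MemPool``; the tensor size is arbitrary)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # this allocation is served from ``pool``
+            x = torch.empty(1024, device="cuda")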
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
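+
+
+# Illustrative sketch only (not part of the original module): the peak-memory
+# helpers above are typically combined like this to measure a single step of a
+# training loop.  ``step_fn`` stands in for arbitrary user code.
+def _example_peak_memory_of_step(step_fn, device=None):
+    torch.cuda.reset_peak_memory_stats(device)
+    step_fn()
+    torch.cuda.synchronize(device)
+    # peak bytes occupied by tensors since the reset above
+    return torch.cuda.max_memory_allocated(device)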
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import ( # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream, this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream.
            If it is ``None``, then
+            the default stream for the selected device is used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the caching allocator's memory allocation on a CUDA device.
+    The allowed value equals the total visible memory multiplied by the fraction.
+    If a process tries to allocate more than the allowed value, the allocator
+    raises an out-of-memory error.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and becomes
+    visible in `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
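+
+    A small illustrative sequence (the tensor size is arbitrary)::
+
+        x = torch.empty(1024, 1024, device="cuda")
+        del x                     # the block goes back to the caching allocator
+        torch.cuda.empty_cache()  # the cached block is returned to the driver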
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
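+
+    An illustrative sketch (assumes a CUDA device is available) of printing the
+    summary while handling an out-of-memory error, one common use of this function:
+
+    Example::
+
+        import torch
+
+        try:
+            y = torch.empty(1 << 40, device="cuda")  # deliberately far too large
+        except torch.cuda.OutOfMemoryError:
+            print(torch.cuda.memory_summary(abbreviated=True))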
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
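+
+    An illustrative sketch (assumes a CUDA device is present). The returned values
+    come directly from ``cudaMemGetInfo``, so the free amount reflects all
+    processes on the device, not only the current one:
+
+    Example::
+
+        import torch
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"{free_bytes / 2**30:.1f} GiB free of {total_bytes / 2**30:.1f} GiB")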
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to CUDA, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache() was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
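+
+    An illustrative sketch (assumes a CUDA device is available; it relies on
+    :func:`~torch.cuda.use_mem_pool`, defined later in this module, to route
+    allocations into the pool):
+
+    Example::
+
+        import torch
+
+        pool = torch.cuda.MemPool()  # pool backed by the current allocator configuration
+        with torch.cuda.use_mem_pool(pool):
+            x = torch.ones(1024, device="cuda")  # this allocation is routed to `pool`
+        segments = pool.snapshot()  # inspect segments owned by this pool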
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
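+
+    An illustrative interop sketch (assumes a CUDA device is available; the
+    external library receiving the pointer is a placeholder, not part of this API):
+
+    Example::
+
+        import torch
+
+        ptr = torch.cuda.caching_allocator_alloc(1024)  # 1 KiB on the current device/stream
+        # ... hand `ptr` to an external framework that expects a raw device pointer ...
+        torch.cuda.caching_allocator_delete(ptr)        # return the block to the caching allocator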
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
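+
+    An illustrative sketch of the per-iteration measurement described above
+    (assumes a CUDA device; ``loader`` and ``train_step`` are placeholders for
+    user code, not part of this API):
+
+    Example::
+
+        import torch
+
+        for batch in loader:                      # placeholder data iterator
+            torch.cuda.reset_peak_memory_stats()  # restart peak tracking for this step
+            train_step(batch)                     # placeholder training step
+            peak = torch.cuda.max_memory_reserved()
+            print(f"peak reserved this iteration: {peak / 2**20:.0f} MiB")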
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
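+
+
+# Illustrative usage sketch: a minimal, hedged walk-through of the helpers
+# defined above (memory stats, snapshot recording, and MemPool routing). The
+# function below is not part of the original module; its name and the
+# placeholder filename "memory_snapshot.pickle" are assumptions for the
+# example only. It is never called at import time and does nothing without a
+# CUDA-capable build.
+def _example_memory_debugging():
+    """Illustrative sketch only; exercises the helpers documented above."""
+    if not torch.cuda.is_available():
+        return
+
+    # Global free/total device memory and a compact allocator report.
+    free_bytes, total_bytes = mem_get_info()
+    print(f"free={free_bytes} bytes / total={total_bytes} bytes")
+    print(memory_summary(abbreviated=True))
+
+    # Record alloc/free history, allocate something, then dump a snapshot
+    # that can be loaded in the viewer at pytorch.org/memory_viz.
+    _record_memory_history(enabled="all")
+    scratch = torch.empty(1024, 1024, device="cuda")
+    _dump_snapshot("memory_snapshot.pickle")
+    _record_memory_history(enabled=None)
+
+    # Route allocations made inside the context manager to a private pool and
+    # inspect what the pool currently holds.
+    pool = MemPool()
+    with use_mem_pool(pool):
+        routed = torch.empty(1024, device="cuda")
+    print(len(pool.snapshot()))
+    del scratch, routed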
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
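+
+    A minimal usage sketch, assuming a CUDA-enabled build with at least one
+    visible device (the figures in the printed table will differ per run)::
+
+        import torch
+
+        x = torch.randn(1024, 1024, device="cuda")
+        # Compact, human-readable table; useful inside an OOM handler or a
+        # periodic logging hook.
+        print(torch.cuda.memory_summary(device=0, abbreviated=True))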
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
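+
+    A minimal sketch, assuming ``pynvml`` (or ``amdsmi`` on ROCm) is installed;
+    otherwise the function returns a short diagnostic string instead::
+
+        import torch
+
+        # One line per process using the GPU, e.g. "process 1234 uses 512.000 MB GPU memory"
+        print(torch.cuda.list_gpu_processes(device=0))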
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
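+
+    A small illustrative sketch (assumes a visible CUDA device; the numbers are
+    device dependent)::
+
+        import torch
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info(device=0)
+        print(f"{free_bytes / total_bytes:.1%} of device memory is currently free")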
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
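+
+    A minimal sketch, assuming a CUDA-enabled build (routing allocations into
+    the pool is done with :func:`~torch.cuda.use_mem_pool`, shown there)::
+
+        import torch
+
+        pool = torch.cuda.MemPool()  # backed by the current default allocator
+        print(pool.id)               # tuple of two ints identifying the pool
+        print(pool.use_count())      # reference count held on the pool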
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
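+
+    A minimal sketch of an alloc/free round trip, assuming CUDA is available
+    (the raw pointer is only meaningful to code that expects a device address)::
+
+        import torch
+
+        ptr = torch.cuda.caching_allocator_alloc(1024)  # 1 KiB on the current device
+        try:
+            pass  # hand ``ptr`` to another framework expecting a raw CUDA pointer
+        finally:
+            torch.cuda.caching_allocator_delete(ptr)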
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
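+
+    A minimal sketch of reading a few individual statistics, assuming a CUDA
+    device is available (key names follow the scheme described below)::
+
+        import torch
+
+        x = torch.randn(1024, 1024, device="cuda")
+        stats = torch.cuda.memory_stats(device=0)
+        print(stats["allocated_bytes.all.current"], stats["num_alloc_retries"])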
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
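+
+    A small sketch of the per-iteration measurement pattern mentioned above,
+    assuming a CUDA device is available::
+
+        import torch
+
+        for step in range(3):
+            torch.cuda.reset_peak_memory_stats()
+            out = torch.randn(4096, 4096, device="cuda").relu()
+            print(f"step {step}: peak reserved = {torch.cuda.max_memory_reserved():,} bytes")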
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
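+
+    A hedged sketch: the library path and symbol names below are placeholders,
+    and the call must happen before any CUDA memory has been allocated::
+
+        import torch
+
+        # Hypothetical .so exposing functions with the signatures documented
+        # in CUDAPluggableAllocator.
+        alloc = torch.cuda.memory.CUDAPluggableAllocator(
+            "/path/to/libcustom_allocator.so", "my_malloc", "my_free"
+        )
+        torch.cuda.memory.change_current_allocator(alloc)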
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
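+
+    A minimal illustrative sketch::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # allocations made inside the context are routed to ``pool``
+            x = torch.empty(1024, device="cuda")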
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
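+
+    A short illustrative sketch::
+
+        x = torch.empty(1024, 1024, device="cuda")
+        del x                      # block is returned to the caching allocator
+        torch.cuda.empty_cache()   # cached, unoccupied blocks are released to the device
+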
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
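+
+    A minimal illustrative sketch of per-iteration peak tracking
+    (``train_step`` is a hypothetical workload)::
+
+        torch.cuda.reset_peak_memory_stats()
+        train_step()
+        peak_bytes = torch.cuda.max_memory_reserved()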
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
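+
+    A minimal illustrative sketch::
+
+        print(torch.cuda.memory_summary(abbreviated=True))
+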
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
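+
+    A minimal illustrative sketch::
+
+        print(torch.cuda.list_gpu_processes())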
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
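+
+    A minimal illustrative sketch::
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"free: {free_bytes / 2**30:.2f} GiB of {total_bytes / 2**30:.2f} GiB")
+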
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
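+
+    A minimal illustrative sketch::
+
+        pool = torch.cuda.MemPool()
+        print(pool.id)           # a tuple of two ints identifying this pool
+        print(pool.use_count())  # reference count held on the pool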
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
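+
+    A minimal illustrative sketch::
+
+        ptr = torch.cuda.caching_allocator_alloc(1024)  # 1 KiB on the current device and stream
+        torch.cuda.caching_allocator_delete(ptr)
+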
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException: # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                           'active_awaiting_free', # waiting for another stream to finish using
+                                                   # this, then it will become free
+                           'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc', # memory allocated
+            'free_requested', # the allocator received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                              # able to be used in future allocation calls
+            'segment_alloc', # the caching allocator asked cudaMalloc for more memory
+                             # and added it as a segment in its cache
+            'segment_free', # the caching allocator called cudaFree to return memory
+                            # to cuda, possibly to free up memory to
+                            # allocate more segments or because empty_cache was called
+            'oom', # the allocator threw an OOM exception. 'size' is
+                   # the requested number of bytes that did not succeed
+            'snapshot' # the allocator generated a memory snapshot
+                       # useful to correlate a previously taken
+                       # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
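+
+
+# ---------------------------------------------------------------------------
+# Editor's illustrative sketch (not part of the original allocator code): a
+# minimal example of how the helpers defined above fit together, assuming a
+# CUDA-capable build. The function name, the toy tensor workload, and the
+# snapshot file name are arbitrary/hypothetical; only the helpers it calls
+# (reset_peak_memory_stats, memory_stats, memory_summary,
+# _record_memory_history, _dump_snapshot, MemPool, use_mem_pool) come from
+# this module.
+def _example_memory_inspection() -> None:
+    if not torch.cuda.is_available():
+        return
+
+    # Record alloc/free history so the snapshot below carries stack traces.
+    _record_memory_history(enabled="all")
+    reset_peak_memory_stats()
+
+    x = torch.randn(1024, 1024, device="cuda")  # arbitrary workload
+    y = x @ x
+    del x, y
+
+    # Flattened statistics: keys follow the "metric.pool.kind" scheme
+    # documented in memory_stats().
+    stats = memory_stats()
+    print("peak allocated bytes:", stats.get("allocated_bytes.all.peak", 0))
+    print(memory_summary(abbreviated=True))
+
+    # Persist a snapshot viewable at pytorch.org/memory_viz, then stop recording.
+    _dump_snapshot("example_snapshot.pickle")
+    _record_memory_history(enabled=None)
+
+    # Route a group of allocations to a dedicated pool.
+    pool = MemPool()
+    with use_mem_pool(pool):
+        scratch = torch.empty(256, device="cuda")
+    del scratch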
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',    # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish
+                                                    # using this, then it will become free
+                           'inactive',]           # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',           # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed can now
+                                   # be used in future allocation calls
+                'segment_alloc',   # the caching allocator asked cudaMalloc for more memory
+                                   # and added it as a segment in its cache
+                'segment_free',    # the caching allocator called cudaFree to return memory
+                                   # to CUDA, possibly to free up memory to allocate
+                                   # more segments or because empty_cache was called
+                'oom',             # the allocator threw an OOM exception; 'size' is
+                                   # the requested number of bytes that did not succeed
+                'snapshot',        # the allocator generated a memory snapshot,
+                                   # useful to correlate a previously taken
+                                   # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
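+
+
+# ---------------------------------------------------------------------------
+# Editor's illustrative sketch (not part of the upstream API): one way the
+# private history/snapshot helpers defined above are typically combined when
+# chasing a memory regression. The wrapper name `_example_capture_snapshot`
+# and the placeholder workload are hypothetical.
+def _example_capture_snapshot(filename: str = "dump_snapshot.pickle") -> None:
+    """Record allocation history around a workload and dump a snapshot that
+    can be loaded in the viewer at pytorch.org/memory_viz."""
+    _record_memory_history(enabled="all", context="all", stacks="all")
+    try:
+        # ... run the workload to be profiled here ...
+        pass
+    finally:
+        _dump_snapshot(filename)
+        _record_memory_history(enabled=None)  # stop recording history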
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
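+
+
+# ---------------------------------------------------------------------------
+# Editor's illustrative sketch (not part of the upstream API): how a pluggable
+# allocator built into a shared library might be installed with the classes
+# defined above. The library path and the exported symbol names are
+# placeholders; change_current_allocator raises if the native allocator has
+# already been initialized, so this must run before any CUDA allocation.
+def _example_install_pluggable_allocator(
+    so_path: str = "./my_alloc.so",  # hypothetical library path
+) -> None:
+    custom = CUDAPluggableAllocator(so_path, "my_malloc", "my_free")
+    change_current_allocator(custom)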
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
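+
+
+# ---------------------------------------------------------------------------
+# Editor's illustrative sketch (not part of the upstream API): routing a block
+# of allocations to a private pool using the context manager defined above.
+# The name `_example_allocate_in_pool` is hypothetical, and the helper assumes
+# a CUDA device is available when it is called.
+def _example_allocate_in_pool() -> "torch.Tensor":
+    pool = MemPool()  # pool backed by the current allocator configuration
+    with use_mem_pool(pool):
+        # allocations made inside this block are served from `pool`
+        t = torch.empty(1024, device="cuda")
+    return t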
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
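+
+    A minimal usage sketch, assuming CUDA is available and some allocations
+    have already been made; the printed table is formatted from the same
+    statistics returned by :func:`~torch.cuda.memory_stats`::
+
+        >>> print(torch.cuda.memory_summary(abbreviated=True))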
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
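+
+    A small illustrative sketch, assuming a CUDA device; the reported numbers
+    depend on the GPU and on other processes using it::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB")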
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
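+
+    A minimal illustrative sketch, assuming a CUDA device and that this
+    module's public names (including ``use_mem_pool``, defined below) are
+    re-exported under ``torch.cuda``::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.ones(1024, device="cuda")  # allocations inside route to ``pool``
+        >>> pool.use_count()  # reference count held on the pool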
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
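+
+    A minimal interoperability sketch, assuming a CUDA device; the external
+    consumer of the raw pointer is hypothetical and stands in for another
+    framework expecting a device address::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)  # 1 KiB on the current device
+        >>> # ... pass ``ptr`` to the external library here ...
+        >>> torch.cuda.caching_allocator_delete(ptr)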
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
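+
+    A per-iteration measurement sketch, as described above; ``train_step`` and
+    ``num_steps`` are placeholders for the user's own training loop::
+
+        >>> for step in range(num_steps):
+        ...     torch.cuda.reset_peak_memory_stats()
+        ...     train_step()
+        ...     print(step, torch.cuda.max_memory_reserved())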
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
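For reviewers, here is a minimal usage sketch of the public helpers this module exposes. It assumes a CUDA-enabled PyTorch build where these functions are reachable through `torch.cuda` (as with the upstream `torch/cuda/memory.py` this file mirrors); availability can vary by PyTorch version, so treat it as illustrative rather than part of the patch:

    import torch

    if torch.cuda.is_available():
        # Start a fresh peak-tracking window for this measurement.
        torch.cuda.reset_peak_memory_stats()

        x = torch.empty(1024, 1024, device="cuda")  # roughly 4 MiB of float32

        # Flat stats dictionary, keyed as "<stat>.<pool>.<metric>".
        stats = torch.cuda.memory_stats()
        print("currently allocated:", stats.get("allocated_bytes.all.current", 0))
        print("peak allocated:", torch.cuda.max_memory_allocated())

        # Human-readable table produced by memory_summary() above.
        print(torch.cuda.memory_summary(abbreviated=True))

        del x
        torch.cuda.empty_cache()  # release cached blocks back to the driver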
+
+
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream; this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream. If it is
+            ``None`` then the default stream for the selected device is used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
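+
+
+# Illustrative sketch, not part of the upstream torch.cuda.memory module: a
+# minimal pairing of caching_allocator_alloc with caching_allocator_delete,
+# e.g. when handing a raw device pointer to another framework. The helper name
+# `_example_raw_allocation` is hypothetical and never called at import time.
+def _example_raw_allocation(num_bytes: int = 1024):
+    """Allocate raw device memory from the caching allocator and free it."""
+    if not torch.cuda.is_available():
+        return None
+    ptr = caching_allocator_alloc(num_bytes)  # current device, current stream
+    try:
+        # `ptr` is a plain integer address that external libraries expecting a
+        # CUDA device pointer can consume directly.
+        return hex(ptr)
+    finally:
+        # Every raw allocation must be handed back to the caching allocator.
+        caching_allocator_delete(ptr)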
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the amount of memory the caching allocator can
+    allocate on a CUDA device. The allowed value equals the total visible memory
+    multiplied by the fraction. If a process tries to allocate more than the
+    allowed value, an out-of-memory error is raised by the allocator.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and is visible
+    in `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
+    """
+    if is_initialized():
+        torch._C._cuda_emptyCache()
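+
+
+# Illustrative sketch, not part of the upstream torch.cuda.memory module: caps
+# this process at a fraction of device memory and then releases cached blocks
+# back to the driver. The helper name `_example_limit_and_release` is
+# hypothetical and never called at import time.
+def _example_limit_and_release(fraction: float = 0.5):
+    """Cap the allocator at `fraction` of total memory, then empty the cache."""
+    if not torch.cuda.is_available():
+        return None
+    set_per_process_memory_fraction(fraction)  # applies to the current device
+    x = torch.randn(1024, 1024, device="cuda")
+    allocated = memory_allocated()  # bytes currently occupied by tensors
+    del x
+    # Cached-but-unused blocks are only returned to the driver on empty_cache().
+    empty_cache()
+    return allocated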
+
+
+def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]:
+    r"""Return a dictionary of CUDA memory allocator statistics for a given device.
+
+    The return value of this function is a dictionary of statistics, each of
+    which is a non-negative integer.
+
+    Core statistics:
+
+    - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      number of allocation requests received by the memory allocator.
+    - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      amount of allocated memory.
+    - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      number of reserved segments from ``cudaMalloc()``.
+    - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      amount of reserved memory.
+    - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      number of active memory blocks.
+    - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      amount of active memory.
+    - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      number of inactive, non-releasable memory blocks.
+    - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      amount of inactive, non-releasable memory.
+
+    For these core statistics, values are broken down as follows.
+
+    Pool type:
+
+    - ``all``: combined statistics across all memory pools.
+    - ``large_pool``: statistics for the large allocation pool
+      (as of October 2019, for size >= 1MB allocations).
+    - ``small_pool``: statistics for the small allocation pool
+      (as of October 2019, for size < 1MB allocations).
+
+    Metric type:
+
+    - ``current``: current value of this metric.
+    - ``peak``: maximum value of this metric.
+    - ``allocated``: historical total increase in this metric.
+    - ``freed``: historical total decrease in this metric.
+
+    In addition to the core statistics, we also provide some simple event
+    counters:
+
+    - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that
+      result in a cache flush and retry.
+    - ``"num_ooms"``: number of out-of-memory errors thrown.
+    - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls.
+    - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both
+      cuMemMap and cudaMalloc.
+    - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap
+      and cudaFree.
+
+    The caching allocator can be configured via the ``PYTORCH_CUDA_ALLOC_CONF``
+    environment variable to not split blocks larger than a defined size (see the
+    Memory Management section of the CUDA semantics documentation).
+    This helps avoid memory fragmentation but may have a performance
+    penalty. Additional outputs to assist with tuning and evaluating impact:
+
+    - ``"max_split_size"``: blocks above this size will not be split.
+    - ``"oversize_allocations.{current,peak,allocated,freed}"``:
+      number of over-size allocation requests received by the memory allocator.
+    - ``"oversize_segments.{current,peak,allocated,freed}"``:
+      number of over-size reserved segments from ``cudaMalloc()``.
+
+    The caching allocator can also be configured via ``PYTORCH_CUDA_ALLOC_CONF``
+    to round memory allocations in order to reduce fragmentation. Sometimes the
+    overhead from rounding can be higher than the fragmentation it helps reduce.
+    The following stat can be used to check if rounding adds too much overhead:
+
+    - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      memory requested by client code; compare this with ``allocated_bytes`` to
+      check if allocation rounding adds too much overhead.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistics for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+
+    .. note::
+        With :ref:`backend:cudaMallocAsync`, some stats are not
+        meaningful, and are always reported as zero.
+    """
+    result = []
+
+    def _recurse_add_to_result(prefix, obj):
+        if isinstance(obj, dict):
+            if len(prefix) > 0:
+                prefix += "."
+            for k, v in obj.items():
+                _recurse_add_to_result(prefix + k, v)
+        else:
+            result.append((prefix, obj))
+
+    stats = memory_stats_as_nested_dict(device=device)
+    _recurse_add_to_result("", stats)
+    result.sort()
+
+    return collections.OrderedDict(result)
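+
+
+# Illustrative sketch, not part of the upstream torch.cuda.memory module: shows
+# how the flattened keys returned by memory_stats() might be consumed. The
+# helper name `_example_report_memory_stats` is hypothetical and never called
+# at import time.
+def _example_report_memory_stats():
+    """Print a few commonly inspected allocator statistics."""
+    if not torch.cuda.is_available():
+        return None
+    stats = memory_stats()
+    report = {
+        key: stats.get(key, 0)
+        for key in (
+            "allocated_bytes.all.current",
+            "allocated_bytes.all.peak",
+            "reserved_bytes.all.current",
+            "num_alloc_retries",
+            "num_ooms",
+        )
+    }
+    for key, value in report.items():
+        print(f"{key}: {value}")
+    return report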
+
+
+def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]:
+    r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary."""
+    if not is_initialized():
+        return {}
+    device = _get_device_index(device, optional=True)
+    return torch._C._cuda_memoryStats(device)
+
+
+def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None:
+    r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator.
+
+    See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to
+    the `"allocated"` and `"freed"` keys in each individual stat dict, as well as
+    `"num_alloc_retries"` and `"num_ooms"`.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    device = _get_device_index(device, optional=True)
+    return torch._C._cuda_resetAccumulatedMemoryStats(device)
+
+
+def reset_peak_memory_stats(device: Union[Device, int] = None) -> None:
+    r"""Reset the "peak" stats tracked by the CUDA memory allocator.
+
+    See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the
+    `"peak"` key in each individual stat dict.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    device = _get_device_index(device, optional=True)
+    return torch._C._cuda_resetPeakMemoryStats(device)
+
+
+def host_memory_stats() -> dict[str, Any]:
+    r"""Return a dictionary of CUDA host (pinned) memory allocator statistics.
+
+    The return value of this function is a dictionary of statistics, each of
+    which is a non-negative integer.
+
+    Core statistics:
+
+    - ``"allocated.{current,peak,allocated,freed}"``:
+      number of allocation requests received by the memory allocator.
+    - ``"allocated_bytes.{current,peak,allocated,freed}"``:
+      amount of allocated memory.
+    - ``"segment.{current,peak,allocated,freed}"``:
+      number of reserved segments from ``cudaHostAlloc()``.
+    - ``"reserved_bytes.{current,peak,allocated,freed}"``:
+      amount of reserved memory.
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
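+
+    Example (illustrative sketch, assuming a CUDA-capable device is present;
+    the exact table contents depend on the workload)::
+
+        >>> print(torch.cuda.memory_summary(abbreviated=True))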
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
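+
+    Example (illustrative sketch, assuming a CUDA-capable device is present)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB total")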
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
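+
+    Example (minimal illustrative sketch, assuming a CUDA device is available)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     a = torch.randn(1024, device="cuda")  # served from ``pool``
+        >>> b = torch.randn(1024, device="cuda")  # served from the default pool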
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from .
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
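+
+    Example (illustrative sketch; the 1024-byte size and the interop use case
+    are placeholders)::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)
+        >>> # ... hand the raw pointer to another framework ...
+        >>> torch.cuda.caching_allocator_delete(ptr)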
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
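+
+    Example (illustrative)::
+
+        >>> stats = torch.cuda.host_memory_stats()
+        >>> stats["allocated_bytes.current"]  # bytes of host memory currently allocated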
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+        `"alloc"`, additionally keep tracebacks for alloc calls.
+        `"all"`, additionally keep tracebacks for free calls.
+        Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc'  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception.
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
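+
+    Example (a minimal sketch, assuming a CUDA-capable build of torch)::
+
+        >>> pool = MemPool()
+        >>> with use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # allocation routed to pool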
+
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
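+
+    A minimal usage sketch (assumes a CUDA device and that this module is importable
+    as ``torch.cuda.memory``; tensors created inside the ``with`` block draw their
+    memory from ``pool``)::
+
+        >>> import torch
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.memory.use_mem_pool(pool):
+        ...     x = torch.ones(8, device="cuda")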
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
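+
+    A minimal sketch of the typical per-iteration measurement pattern,
+    assuming a CUDA device is available (the workload shown is arbitrary):
+
+    Example::
+
+        >>> import torch
+        >>> torch.cuda.reset_peak_memory_stats()           # what this function delegates to
+        >>> y = torch.randn(2048, 2048, device="cuda").sum()
+        >>> peak_bytes = torch.cuda.max_memory_reserved()  # peak bytes held by the caching allocator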
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
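+
+    A minimal sketch, assuming a CUDA device is available; the returned string
+    is preformatted, so it is typically just printed (here from an OOM handler):
+
+    Example::
+
+        >>> import torch
+        >>> try:
+        ...     huge = torch.empty(1 << 40, device="cuda")  # deliberately too large
+        ... except torch.cuda.OutOfMemoryError:
+        ...     print(torch.cuda.memory_summary(abbreviated=True))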
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
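+
+    A minimal sketch, assuming the ``pynvml`` package (or ``amdsmi`` on ROCm)
+    is installed; the result is a plain string, so it can simply be printed:
+
+    Example::
+
+        >>> import torch
+        >>> print(torch.cuda.list_gpu_processes())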
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
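+
+    A minimal sketch, assuming at least one visible CUDA device:
+
+    Example::
+
+        >>> import torch
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes / 1024**3:.1f} GiB free of {total_bytes / 1024**3:.1f} GiB total")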
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
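+
+    A minimal sketch, assuming a CUDA device is available: a pool is normally
+    paired with the ``use_mem_pool`` context manager defined below, so that
+    allocations made inside the ``with`` block are routed to it.
+
+    Example::
+
+        >>> import torch
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # served from ``pool``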
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
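+
+
+# ---------------------------------------------------------------------------
+# Usage sketch (illustrative only; not part of this module's API). It shows
+# how the helpers defined above are commonly combined: record allocation
+# history, route allocations through a private MemPool, then inspect and dump
+# the allocator state. It assumes a CUDA-enabled build; the tensor shape and
+# the snapshot file name below are arbitrary placeholders.
+#
+#     if torch.cuda.is_available():
+#         _record_memory_history("all")      # keep stack traces for alloc/free events
+#         pool = MemPool()                   # pool backed by the current allocator config
+#         with use_mem_pool(pool):
+#             x = torch.empty(1024, 1024, device="cuda")  # allocation lands in `pool`
+#         print(memory_summary())            # human-readable per-device statistics
+#         print(memory_allocated())          # bytes currently occupied by tensors
+#         _dump_snapshot("snapshot.pickle")  # open at pytorch.org/memory_viz
+#         _record_memory_history(None)       # stop recording
+# ---------------------------------------------------------------------------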
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
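+
+
+# ---------------------------------------------------------------------------
+# Illustrative usage sketches.
+# The `_example_*` helpers below are minimal, hedged examples of how the
+# functions defined in this module are typically combined; the helper names,
+# tensor sizes, and print formatting are placeholders chosen for illustration
+# only and are not part of the allocator API.
+# ---------------------------------------------------------------------------
+
+
+def _example_peak_memory_per_step(num_steps: int = 3) -> None:
+    # A minimal sketch of the pattern described in max_memory_allocated():
+    # reset the peak counter at the start of each iteration, run the workload,
+    # then read the per-iteration peak. The matmul stands in for a real step.
+    if not torch.cuda.is_available():
+        return
+    for step in range(num_steps):
+        reset_peak_memory_stats()
+        a = torch.randn(1024, 1024, device="cuda")
+        b = torch.randn(1024, 1024, device="cuda")
+        _ = a @ b
+        torch.cuda.synchronize()
+        print(f"step {step}: peak allocated = {max_memory_allocated()} bytes")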
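+
+
+def _example_report_memory_stats() -> None:
+    # A sketch of reading the flattened "<stat>.<pool>.<metric>" keys returned
+    # by memory_stats(), the human-readable memory_summary() table, and the
+    # flat host-side counters from host_memory_stats(). Missing keys default
+    # to 0 via dict.get, mirroring how memory_allocated() reads its counter.
+    if not torch.cuda.is_available():
+        return
+    _ = torch.empty(1024, device="cuda")  # make sure the allocator is initialized
+    stats = memory_stats()
+    print("current allocated bytes:", stats.get("allocated_bytes.all.current", 0))
+    print("peak reserved bytes:", stats.get("reserved_bytes.all.peak", 0))
+    print("cudaMalloc retries:", stats.get("num_alloc_retries", 0))
+    print(memory_summary(abbreviated=True))
+    print("host allocations (current):", host_memory_stats().get("allocated.current", 0))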
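+
+
+def _example_dump_memory_history(filename: str = "dump_snapshot.pickle") -> None:
+    # A sketch of the snapshot workflow documented in _record_memory_history()
+    # and _dump_snapshot(): enable history recording, run a placeholder
+    # allocation, write the pickle for the viewer at pytorch.org/memory_viz,
+    # and disable recording again by passing ``None``.
+    if not torch.cuda.is_available():
+        return
+    _record_memory_history(enabled="all")
+    try:
+        _ = torch.empty(8 * 1024 * 1024, device="cuda")
+        _dump_snapshot(filename)
+    finally:
+        _record_memory_history(enabled=None)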
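+
+
+def _example_mem_pool_allocation() -> None:
+    # A sketch of routing allocations to a dedicated pool with MemPool and
+    # use_mem_pool(), as described above. The pool uses the allocator's
+    # default configuration (allocator=None) and the tensor size is arbitrary.
+    if not torch.cuda.is_available():
+        return
+    pool = MemPool()
+    with use_mem_pool(pool):
+        x = torch.randn(256, 256, device="cuda")  # served from ``pool``
+    print("pool id:", pool.id, "use count:", pool.use_count())
+    del x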
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
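
A minimal usage sketch of the allocation-history workflow these helpers expose (illustrative only; it assumes a CUDA-capable build and that the private helper names above are unchanged):

    import torch
    from torch.cuda import memory

    # Start recording alloc/free history with Python and C++ stack traces.
    memory._record_memory_history(enabled="all", stacks="all")

    # Run some workload that allocates GPU memory.
    x = torch.randn(1024, 1024, device="cuda")
    y = x @ x

    # Dump the snapshot for the interactive viewer at pytorch.org/memory_viz,
    # then stop recording.
    memory._dump_snapshot("dump_snapshot.pickle")
    memory._record_memory_history(enabled=None)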
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes the amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (~2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
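+
+
+# ---------------------------------------------------------------------------
+# Illustrative usage sketches. The `_example_*` helpers below are hypothetical
+# and are not part of the API defined above; they only demonstrate how these
+# functions might be combined, and they assume a CUDA-capable device.
+# ---------------------------------------------------------------------------
+
+
+# A minimal sketch of reading the allocator counters around a single
+# allocation: `memory_allocated`/`memory_reserved` return byte counts, while
+# the flattened `memory_stats()` dictionary exposes the documented keys.
+def _example_allocator_counters() -> None:
+    before = memory_allocated()
+    x = torch.empty(1024, 1024, device="cuda")  # ~4 MiB of float32
+    print("allocated delta:", memory_allocated() - before, "bytes")
+    print("reserved:", memory_reserved(), "bytes")
+    stats = memory_stats()
+    print("peak allocated:", stats.get("allocated_bytes.all.peak", 0), "bytes")
+    print("OOMs so far:", stats.get("num_ooms", 0))
+    del x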
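+
+
+# A sketch of per-iteration peak tracking, as suggested in the
+# `max_memory_allocated` docstring: resetting the peak counters before each
+# step makes the reported peak per-step rather than global. `train_step` is a
+# placeholder callable supplied by the caller.
+def _example_peak_tracking(train_step, steps: int = 3) -> None:
+    for step in range(steps):
+        reset_peak_memory_stats()
+        train_step()
+        print(f"step {step}: peak allocated {max_memory_allocated()} bytes")
+    print(memory_summary(abbreviated=True))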
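+
+
+# A sketch of an out-of-memory report built from `mem_get_info`,
+# `list_gpu_processes` and `memory_summary`. `build_batch` is a placeholder
+# callable that attempts the failing allocation.
+def _example_oom_report(build_batch) -> str:
+    try:
+        build_batch()
+    except torch.cuda.OutOfMemoryError:
+        free, total = mem_get_info()
+        return (
+            f"OOM: {free} of {total} bytes free\n"
+            + list_gpu_processes()
+            + "\n"
+            + memory_summary(abbreviated=True)
+        )
+    return "no OOM"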
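+
+
+# A sketch of the snapshot workflow documented above: record allocator history
+# around a workload, then dump a pickle that the pytorch.org/memory_viz viewer
+# can load. `workload` is a placeholder callable and the filename is arbitrary.
+def _example_snapshot_workflow(workload) -> None:
+    _record_memory_history(enabled="all", max_entries=100_000)
+    try:
+        workload()
+    finally:
+        _dump_snapshot("example_snapshot.pickle")
+        _record_memory_history(enabled=None)  # stop recording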
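+
+
+# A sketch of the host-side counters: pinned (page-locked) allocations go
+# through the CUDA host allocator, so they should appear in
+# `host_memory_stats()` rather than in the device-side `memory_stats()`.
+def _example_pinned_host_stats() -> None:
+    buf = torch.empty(1024, 1024, pin_memory=True)
+    stats = host_memory_stats()
+    print("pinned host bytes:", stats.get("allocated_bytes.current", 0))
+    del buf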
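+
+
+# A sketch of plugging in a custom allocator. "./my_alloc.so", "my_malloc" and
+# "my_free" are hypothetical names; the library must export functions with the
+# signatures documented in `CUDAPluggableAllocator`, and the switch must happen
+# before the current allocator has been used.
+def _example_pluggable_allocator() -> None:
+    custom = CUDAPluggableAllocator("./my_alloc.so", "my_malloc", "my_free")
+    change_current_allocator(custom)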
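+
+
+# A sketch of routing allocations to a dedicated pool with `use_mem_pool`, so
+# tensors created inside the block can be inspected separately from the
+# default pool.
+def _example_mem_pool() -> None:
+    pool = MemPool()
+    with use_mem_pool(pool):
+        scratch = torch.empty(256, 256, device="cuda")
+    print("pool id:", pool.id, "use count:", pool.use_count())
+    del scratch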
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
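+
+    A minimal usage sketch (illustrative; assumes a CUDA device is available and
+    uses ``use_mem_pool``, defined later in this module)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            x = torch.empty(1024, device="cuda")  # routed to the pool above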
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
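+
+    A hedged usage sketch (assumes a CUDA device and a PyTorch build that
+    exposes ``torch.cuda.OutOfMemoryError``; the requested size is chosen only
+    so that the allocation fails)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> try:
+        ...     too_big = torch.empty(1 << 38, device="cuda")
+        ... except torch.cuda.OutOfMemoryError:
+        ...     print(torch.cuda.memory_summary(abbreviated=True))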
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
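+
+    Illustrative sketch only (the numbers reported depend on the GPU and on
+    other processes sharing it)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB")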
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and adds it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
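+
+    A hedged construction sketch (assumes a CUDA build; no ``allocator`` is
+    passed, so the pool is backed by the current CUDACachingAllocator
+    configuration)::
+
+        >>> # xdoctest: +SKIP("requires CUDA")
+        >>> pool = torch.cuda.MemPool()
+        >>> pool.id           # tuple of two ints identifying the pool
+        >>> pool.use_count()  # reference count held on the pool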
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
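+
+    A minimal sketch of switching to a pluggable allocator; the ``.so`` path and
+    symbol names below are placeholders, not a real library:
+
+    .. code-block:: python
+
+        new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+            "alloc.so", "my_malloc", "my_free"  # hypothetical file and symbol names
+        )
+        torch.cuda.memory.change_current_allocator(new_alloc)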
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
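+
+    A minimal usage sketch (assumes a CUDA device is available):
+
+    .. code-block:: python
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # allocations made inside the block are routed to ``pool``
+            buf = torch.empty(1024, device="cuda")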
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from .
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
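+
+    A minimal usage sketch (assumes CUDA is available on device 0):
+
+    .. code-block:: python
+
+        ptr = torch.cuda.caching_allocator_alloc(1024, device=0)
+        # ... hand ``ptr`` to another framework ...
+        torch.cuda.caching_allocator_delete(ptr)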
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream, this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream.
+            If it is ``None``, the default stream for the selected device is used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the caching allocator to allocating memory on a CUDA device.
+    The allowed value equals the total visible memory multiplied by the fraction.
+    If a process tries to allocate more than the allowed value, the allocator raises an
+    out-of-memory error.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and becomes
+    visible in `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
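+
+    A minimal illustration (``big`` is a placeholder for any tensor you no longer need)::
+
+        del big                     # drop the last reference so its blocks become unoccupied
+        torch.cuda.empty_cache()    # release the unoccupied cached blocks back to the driver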
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
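+
+
+# Illustrative sketch: one way the recording and snapshot helpers in this
+# module can be combined. The tensor workload is only a stand-in for real user
+# code, the filename is arbitrary, and a CUDA device is assumed to be available.
+def _example_record_and_snapshot(filename: str = "example_snapshot.pickle"):
+    # Record stack traces plus a full history of alloc/free events.
+    _record_memory_history(enabled="all", context="all", stacks="all", max_entries=100000)
+    try:
+        # Stand-in workload: allocate and release a few CUDA tensors.
+        tensors = [torch.randn(1024, 1024, device="cuda") for _ in range(4)]
+        del tensors
+        torch.cuda.empty_cache()
+        # Capture the allocator state and persist it for the memory_viz viewer.
+        snapshot = _snapshot()
+        _dump_snapshot(filename)
+    finally:
+        # Stop recording history.
+        _record_memory_history(enabled=None)
+    return snapshot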
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                           'active_awaiting_free', # waiting for another stream to finish using
+                                                   # this, then it will become free
+                           'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested', # the allocation received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                              # able to be used in future allocation calls
+            'segment_alloc', # the caching allocator asked cudaMalloc for more memory
+                             # and added it as a segment in its cache
+            'segment_free', # the caching allocator called cudaFree to return memory
+                            # to cuda, possibly trying to free up memory to
+                            # allocate more segments or because empty_cache() was called
+            'oom',          # the allocator threw an OOM exception. 'size' is
+                            # the requested number of bytes that did not succeed
+            'snapshot'      # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
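+
+
+# Illustrative sketch tying together the snapshot helpers above: take a single
+# snapshot and render both the segment view and the memory-over-time view. The
+# output filenames are arbitrary placeholders.
+def _example_visualize_snapshot(prefix: str = "memviz"):
+    snapshot = _snapshot()
+    # Both helpers accept an existing snapshot, so the allocator state is only
+    # queried once.
+    _save_segment_usage(f"{prefix}_segments.svg", snapshot)
+    _save_memory_usage(f"{prefix}_memory.svg", snapshot)
+    return snapshot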
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
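+
+
+# Illustrative sketch of routing allocations through a dedicated pool with the
+# MemPool / use_mem_pool helpers above. Assumes a CUDA device is available; the
+# tensor shape is arbitrary.
+def _example_use_mem_pool():
+    pool = MemPool()
+    with use_mem_pool(pool):
+        # Allocations made inside the context are served from `pool`.
+        scratch = torch.empty(1024, 1024, device="cuda")
+    # The tensor is still alive here, so the pool's snapshot reports the
+    # segments backing it even after the context has exited.
+    segments = pool.snapshot()
+    del scratch
+    return pool.id, pool.use_count(), len(segments)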
+
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
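+
+
+# Illustrative sketch, not part of the upstream module: routing allocations
+# from a block of code into a dedicated pool with the MemPool / use_mem_pool
+# API defined above. Assumes this module is importable as torch.cuda.memory
+# (as in upstream PyTorch) and that a CUDA device is available.
+#
+#   pool = torch.cuda.memory.MemPool()        # pool backed by the default allocator
+#   with torch.cuda.memory.use_mem_pool(pool):
+#       x = torch.empty(1 << 20, device="cuda")   # this allocation is routed to `pool`
+#   print(pool.use_count())                   # reference count of the pool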
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
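+
+    A minimal sketch of the per-iteration measurement described above, assuming a
+    CUDA device is available (``model`` and ``batch`` are hypothetical placeholders)::
+
+        torch.cuda.reset_peak_memory_stats()
+        loss = model(batch).sum()                  # hypothetical forward pass on the GPU
+        loss.backward()
+        peak = torch.cuda.max_memory_reserved()    # peak reserved bytes for this iteration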
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc'  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache() was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
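+
+    A minimal usage sketch, assuming a CUDA device is available and that this module's
+    ``MemPool`` and ``use_mem_pool`` are re-exported under ``torch.cuda`` as usual::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            x = torch.randn(1024, device="cuda")  # this allocation is routed to ``pool``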
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
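+
+
+# A minimal, commented sketch (kept as comments so importing this module stays
+# side-effect free) of the memory-history / snapshot workflow provided above;
+# `train_step` is a hypothetical workload and a CUDA device is assumed:
+#
+#     _record_memory_history(max_entries=100000)   # start recording alloc/free events
+#     train_step()                                  # hypothetical CUDA workload
+#     _dump_snapshot("snapshot.pickle")             # open at pytorch.org/memory_viz
+#     _record_memory_history(enabled=None)          # stop recording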
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
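+
+    Example (illustrative; the reported backend depends on how
+    ``PYTORCH_CUDA_ALLOC_CONF`` is set, ``native`` being the default)::
+
+        >>> torch.cuda.get_allocator_backend()
+        'native'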
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
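+
+    Example (an illustrative sketch; assumes a CUDA device is available)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # served from ``pool``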
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
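+
+
+# An illustrative sketch (not part of the module's public surface): a minimal
+# memory-debugging workflow built from the helpers defined above. ``step_fn`` is a
+# hypothetical user-provided callable that runs the workload to be profiled.
+def _example_snapshot_workflow(step_fn, filename="snapshot.pickle"):
+    _record_memory_history(max_entries=100000)  # start recording alloc/free history
+    step_fn()  # run the workload while the history is being recorded
+    _dump_snapshot(filename)  # write a snapshot viewable at pytorch.org/memory_viz
+    _record_memory_history(None)  # stop recording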
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream; this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream. If it is ``None``
+            then the default stream for the selected device is used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
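+
+    Example (an illustrative sketch; assumes an initialized CUDA context)::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)
+        >>> # hand ``ptr`` to code expecting a raw device pointer, then release it
+        >>> torch.cuda.caching_allocator_delete(ptr)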
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the caching allocator to allocating memory on a CUDA device.
+    The allowed value equals the total visible memory multiplied by the fraction.
+    If a process tries to allocate more than the allowed value, the allocator raises an
+    out-of-memory error.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that the memory can be used by other GPU applications and become
+    visible in `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
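+
+    Example (illustrative)::
+
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> del x                     # the block returns to the allocator's cache
+        >>> torch.cuda.empty_cache()  # cached blocks are released back to the driver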
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
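+
+    Example (illustrative only; the exact table contents depend on the device
+    and workload)::
+
+        >>> print(torch.cuda.memory_summary())                    # current device
+        >>> print(torch.cuda.memory_summary(device=0, abbreviated=True))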
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
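+
+    Example (a small sketch; the reported numbers are device dependent)::
+
+        >>> free, total = torch.cuda.mem_get_info()
+        >>> print(f"{free / 1024**3:.2f} GiB free of {total / 1024**3:.2f} GiB")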
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
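+
+    Example (a hedged sketch; it assumes a CUDA device and that ``MemPool`` and
+    :func:`~torch.cuda.use_mem_pool` are re-exported under ``torch.cuda``)::
+
+        >>> pool = torch.cuda.MemPool()                 # pool backed by the current allocator
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")    # this allocation is routed to ``pool``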
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
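+
+    Example (a minimal sketch of the alloc/free pairing; the call returns a raw
+    device pointer as an ``int``, not a tensor)::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)   # 1 KiB on the current device/stream
+        >>> torch.cuda.caching_allocator_delete(ptr)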
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
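+
+    Example (a sketch of per-iteration peak tracking; ``step()`` is a
+    hypothetical stand-in for one training iteration)::
+
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> step()                                     # hypothetical workload
+        >>> peak = torch.cuda.max_memory_reserved()    # bytes reserved at the peak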
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                        'active_awaiting_free', # waiting for another stream to finish using
+                                                # this, then it will become free
+                        'inactive',] # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc'  # memory allocated
+            'free_requested', # the allocation received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                              # able to be used in future allocation calls
+            'segment_alloc', # the caching allocator asks cudaMalloc for more memory
+                             # and added it as a segment in its cache
+            'segment_free', # the caching allocator called cudaFree to return memory
+                            # to cuda, possibly trying to free up memory to
+                            # allocate more segments or because empty_cache was called
+            'oom',          # the allocator threw an OOM exception. 'size' is
+                            # the requested number of bytes that did not succeed
+            'snapshot'      # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
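+
+
+# Illustrative usage sketch for the snapshot helpers above. The output prefix is
+# a placeholder; the pickle can be loaded in the viewer at pytorch.org/memory_viz.
+def _example_dump_snapshot(prefix="memory_debug"):
+    snap = _snapshot()
+    with open(f"{prefix}.pickle", "wb") as f:
+        pickle.dump(snap, f)  # same format as _dump_snapshot()
+    _save_memory_usage(filename=f"{prefix}_memory.svg", snapshot=snap)
+    _save_segment_usage(filename=f"{prefix}_segments.svg", snapshot=snap)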
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
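+
+
+# Illustrative usage sketch for the pluggable-allocator API above. "alloc.so",
+# "my_malloc" and "my_free" are hypothetical names; the shared library must
+# export functions with the C signatures documented in CUDAPluggableAllocator.
+def _example_use_pluggable_allocator():
+    new_alloc = CUDAPluggableAllocator("alloc.so", "my_malloc", "my_free")
+    change_current_allocator(new_alloc)  # must run before any CUDA memory is allocated
+    return torch.rand(8, device="cuda")  # served by the custom allocator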
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
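+
+
+# Illustrative usage sketch for MemPool/use_mem_pool above. Assumes a CUDA device
+# is available; the tensor shape is an arbitrary placeholder.
+def _example_use_mem_pool():
+    pool = MemPool()  # pool backed by the current allocator configuration
+    with use_mem_pool(pool):
+        buf = torch.empty(1024, device="cuda")  # allocated from `pool`
+    # Allocations made outside the context go back to the regular allocator.
+    return pool.use_count(), buf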
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
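+
+
+# --- Editor's illustrative sketch (not part of the upstream module) ----------------
+# A minimal, hedged example of the snapshot/history helpers documented above
+# (_record_memory_history, _snapshot, _dump_snapshot). It assumes a CUDA-enabled
+# PyTorch build and device; the filename below is arbitrary. Nothing here runs at
+# import time -- the function is only a usage sketch.
+def _example_memory_history(filename: str = "example_snapshot.pickle") -> None:
+    if not torch.cuda.is_available():
+        return
+    # Record alloc/free events together with Python and C++ stack traces.
+    _record_memory_history(enabled="all", context="all", stacks="all")
+    x = torch.empty(1024, 1024, device="cuda")  # produces an 'alloc' TraceEntry
+    del x  # produces 'free_requested' / 'free_completed' entries
+    # Persist the snapshot for the viewer at pytorch.org/memory_viz, then stop recording.
+    _dump_snapshot(filename)
+    _record_memory_history(enabled=None)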
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc'  # memory allocated
+            'free_requested',  # the allocator received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache() was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
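+
+
+# Illustrative usage sketch, not part of the upstream API surface: the helper
+# below shows one way to read the allocator counters documented above around a
+# single allocation. The function name and the tensor shape are arbitrary
+# examples, and a CUDA-capable device is assumed to be available.
+def _example_report_allocator_stats() -> None:
+    before = memory_allocated()
+    x = torch.empty(1024, 1024, device="cuda")  # ~4 MiB of float32
+    stats = memory_stats()
+    print("allocated_bytes.all.current:", stats["allocated_bytes.all.current"])
+    print("reserved_bytes.all.current:", stats["reserved_bytes.all.current"])
+    print(memory_summary(abbreviated=True))
+    del x
+    empty_cache()  # release cached, unoccupied blocks back to the driver
+    print("net allocated bytes after cleanup:", memory_allocated() - before)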
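+
+
+# A further sketch, assuming the private history helpers defined above keep
+# their current signatures: record allocation history, create some traffic,
+# and dump a pickle that the viewer at pytorch.org/memory_viz can open. The
+# function name and the output file name are arbitrary examples.
+def _example_capture_memory_snapshot(filename: str = "example_snapshot.pickle") -> None:
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    try:
+        tensors = [torch.empty(256, 256, device="cuda") for _ in range(8)]
+        del tensors
+    finally:
+        _dump_snapshot(filename)
+        _record_memory_history(enabled=None)  # stop recording history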
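+
+
+# A minimal MemPool sketch: route allocations made inside the context manager
+# to a dedicated pool, then inspect that pool. It relies only on the MemPool
+# and use_mem_pool definitions above; the function name and sizes are arbitrary.
+def _example_route_allocations_to_mempool() -> None:
+    pool = MemPool()
+    with use_mem_pool(pool):
+        x = torch.empty(512, 512, device="cuda")
+    print("pool id:", pool.id)
+    print("pool use count:", pool.use_count())
+    del x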
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
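+
+
+# Illustrative note (added commentary, not upstream documentation): the private
+# helper above is expected to accept the same comma-separated ``key:value`` string
+# used by the ``PYTORCH_CUDA_ALLOC_CONF`` environment variable, e.g. the (assumed)
+# call ``_set_allocator_settings("max_split_size_mb:128,garbage_collection_threshold:0.8")``.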
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the ``.so`` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
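+
+    A minimal illustrative sketch (assumes a CUDA-enabled build)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     buf = torch.empty(1024, device="cuda")  # allocation routed to ``pool``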
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
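+
+    A short usage sketch (illustrative only):
+
+    Example::
+
+        >>> # print an abbreviated summary for the current device
+        >>> print(torch.cuda.memory_summary(abbreviated=True))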
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
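+
+    A small sketch of how the return value is typically used (illustrative only):
+
+    Example::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> used_fraction = 1 - free_bytes / total_bytes  # device-wide usage, not just this process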
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
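+
+
+# A minimal usage sketch for the pool API defined above (illustrative only; it
+# assumes a CUDA device is available and that ``MemPool``/``use_mem_pool`` are
+# re-exported on ``torch.cuda``, as suggested by their presence in ``__all__``):
+#
+#     pool = torch.cuda.MemPool()
+#     with torch.cuda.use_mem_pool(pool):
+#         x = torch.empty(64, 1024, 1024, device="cuda")  # served from ``pool``
+#     print(pool.snapshot())  # inspect this pool's state via MemPool.snapshot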
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
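+
+# Illustrative sketch, not part of the upstream module: a minimal example of
+# how the inspection, snapshot, and pooling helpers defined above are usually
+# combined. It assumes a CUDA-capable build of PyTorch; the helper name,
+# tensor shapes, and snapshot filename are arbitrary. Kept as an unused
+# private helper so that importing the module has no side effects.
+def _memory_api_example() -> None:
+    if not torch.cuda.is_available():
+        return
+
+    reset_peak_memory_stats()
+
+    x = torch.randn(1024, 1024, device="cuda")  # ~4 MiB of float32
+    print(memory_allocated())      # bytes currently occupied by live tensors
+    print(memory_reserved())       # bytes cached by the allocator
+
+    del x
+    empty_cache()                  # release unused cached blocks back to the driver
+    print(max_memory_allocated())  # peak since reset_peak_memory_stats()
+    print(memory_summary(abbreviated=True))
+
+    # Record alloc/free history and dump a snapshot for the pytorch.org/memory_viz viewer.
+    _record_memory_history(enabled="all")
+    y = torch.randn(512, device="cuda")
+    _dump_snapshot("example_snapshot.pickle")
+    _record_memory_history(enabled=None)
+    del y
+
+    # Route allocations to a dedicated pool using the MemPool/use_mem_pool helpers above.
+    pool = MemPool()
+    with use_mem_pool(pool):
+        z = torch.randn(256, device="cuda")  # served from `pool`
+    print(pool.snapshot())
+    del z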
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from .
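+
+# Reviewer note: a second illustrative sketch, also not part of the upstream
+# module, showing how the allocator statistics helpers documented in this
+# module (memory_stats, memory_reserved, memory_summary, empty_cache) might be
+# read. The flat key "allocated_bytes.all.current" is taken from the
+# memory_stats docstring; the buffer size is arbitrary. Guarded and CUDA-gated
+# like the previous sketch.
+if __name__ == "__main__" and torch.cuda.is_available():
+    buf = torch.empty(4 * 1024 * 1024, device="cuda")  # 4M float32 values, ~16 MiB
+    flat_stats = memory_stats()
+    print("allocated_bytes.all.current:", flat_stats["allocated_bytes.all.current"])
+    print("reserved_bytes.all.current:", memory_reserved())
+    print(memory_summary(abbreviated=True))
+    del buf
+    empty_cache()  # return the cached, now-unused segments to the driver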
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream, this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream.
If it is ``None`` then
+            the default stream for the selected device is used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the memory that the caching allocator can allocate on a CUDA device.
+    The allowed value equals the total visible memory multiplied by the fraction.
+    If a process tries to allocate more than the allowed value, the allocator raises an
+    out of memory error.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and is visible in
+    `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if
+    rounding adds too much overhead:
+
+    - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``:
+      memory requested by client code; compare this with allocated_bytes to check if
+      allocation rounding adds too much overhead.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistics for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+
+    .. note::
+        With :ref:`backend:cudaMallocAsync`, some stats are not
+        meaningful, and are always reported as zero.
+    """
+    result = []
+
+    def _recurse_add_to_result(prefix, obj):
+        if isinstance(obj, dict):
+            if len(prefix) > 0:
+                prefix += "."
+            for k, v in obj.items():
+                _recurse_add_to_result(prefix + k, v)
+        else:
+            result.append((prefix, obj))
+
+    stats = memory_stats_as_nested_dict(device=device)
+    _recurse_add_to_result("", stats)
+    result.sort()
+
+    return collections.OrderedDict(result)
+
+
+def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]:
+    r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary."""
+    if not is_initialized():
+        return {}
+    device = _get_device_index(device, optional=True)
+    return torch._C._cuda_memoryStats(device)
+
+
+def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None:
+    r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator.
+
+    See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to
+    the `"allocated"` and `"freed"` keys in each individual stat dict, as well as
+    `"num_alloc_retries"` and `"num_ooms"`.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    device = _get_device_index(device, optional=True)
+    return torch._C._cuda_resetAccumulatedMemoryStats(device)
+
+
+def reset_peak_memory_stats(device: Union[Device, int] = None) -> None:
+    r"""Reset the "peak" stats tracked by the CUDA memory allocator.
+
+    See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the
+    `"peak"` key in each individual stat dict.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    device = _get_device_index(device, optional=True)
+    return torch._C._cuda_resetPeakMemoryStats(device)
+
+
+def host_memory_stats() -> dict[str, Any]:
+    r"""Return a dictionary of CUDA host memory allocator statistics.
+
+    The return value of this function is a dictionary of statistics, each of
+    which is a non-negative integer.
+
+    Core statistics:
+
+    - ``"allocated.{current,peak,allocated,freed}"``:
+      number of allocation requests received by the memory allocator.
+    - ``"allocated_bytes.{current,peak,allocated,freed}"``:
+      amount of allocated memory.
+    - ``"segment.{current,peak,allocated,freed}"``:
+      number of reserved segments from ``cudaMalloc()``.
+    - ``"reserved_bytes.{current,peak,allocated,freed}"``:
+      amount of reserved memory.
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly to free up memory for more segments
+                                 # or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
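+
+
+# ---------------------------------------------------------------------------
+# Illustrative usage sketch: a minimal, hedged example of how the helpers
+# defined in this module might be exercised together. It assumes a
+# CUDA-capable build and an available device; the output file name
+# "memory_snapshot_example.pickle" is arbitrary.
+# ---------------------------------------------------------------------------
+if __name__ == "__main__" and torch.cuda.is_available():
+    # Record stack traces for every alloc/free so snapshots can attribute memory.
+    _record_memory_history(enabled="all")
+
+    x = torch.empty(1024, 1024, device="cuda")  # roughly 4 MiB of float32
+    print(memory_summary(abbreviated=True))
+    print(f"allocated_bytes.all.current = {memory_allocated()}")
+
+    # Route allocations made inside the context manager to a private pool.
+    pool = MemPool()
+    with use_mem_pool(pool):
+        y = torch.empty(256, 256, device="cuda")
+    print(f"pool id: {pool.id}, use_count: {pool.use_count()}")
+
+    # Persist a snapshot that the pytorch.org/memory_viz viewer can open.
+    _dump_snapshot("memory_snapshot_example.pickle")
+
+    del x, y
+    empty_cache()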
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
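A short, hedged sketch of the MemPool API that closes the module. It is not part of the diff; the tensor shape and device index are arbitrary, and the module is assumed to be importable as `torch.cuda.memory`, as in upstream PyTorch:

    import torch
    from torch.cuda.memory import MemPool, use_mem_pool

    pool = MemPool()  # pool backed by the current CUDACachingAllocator configuration
    with use_mem_pool(pool, device=0):
        buf = torch.empty(256, device="cuda:0")  # allocations inside the context route to `pool`
    print(pool.id, pool.use_count())  # inspect the pool's id tuple and reference count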
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
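+
+    For example, peak tracking can bracket a single training iteration
+    (illustrative sketch; assumes an initialized CUDA context)::
+
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> # ... run one forward/backward iteration ...
+        >>> peak_reserved = torch.cuda.max_memory_reserved()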
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+                filename: str
+                line: int
+                name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocator received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-based OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
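+
+    Example (illustrative sketch; the ``.so`` path and the entry-point names
+    below are placeholders for a user-provided pluggable allocator)::
+
+        >>> # hypothetical shared library and symbol names
+        >>> new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
+        ...     "my_alloc.so", "my_malloc", "my_free"
+        ... )
+        >>> torch.cuda.memory.change_current_allocator(new_alloc)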
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
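+
+    Example (illustrative sketch; assumes a visible CUDA device)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # allocation served from the pool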
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocator received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly to free up memory to
+                             # allocate more segments or because empty_cache() was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
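+
+    Example (an illustrative sketch; relies only on the default allocator
+    configuration)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> pool_id = pool.id  # tuple of two ints identifying this pool
+        >>> refs = pool.use_count()  # reference count currently held on the pool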
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
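+
+
+# A minimal usage sketch for the pool APIs above (illustrative only; the
+# variable names and tensor size are arbitrary). It runs only when this file
+# is executed directly and a CUDA device is available.
+if __name__ == "__main__" and torch.cuda.is_available():
+    demo_pool = MemPool()
+    with use_mem_pool(demo_pool):
+        # Allocations made inside the context are routed to ``demo_pool``.
+        scratch = torch.empty(1024, device="cuda")
+        del scratch
+    print("pool id:", demo_pool.id, "use count:", demo_pool.use_count())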
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
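# Illustrative sketch (editor's addition, not part of the original patch):
# comparing requested_bytes with allocated_bytes to estimate how much memory
# allocation rounding adds. Key names follow the docstring above; older builds
# may not report requested_bytes, hence the defensive .get().
import torch

if torch.cuda.is_available():
    stats = torch.cuda.memory_stats()
    requested = stats.get("requested_bytes.all.allocated", 0)
    allocated = stats.get("allocated_bytes.all.allocated", 0)
    if requested:
        overhead = (allocated - requested) / requested
        print(f"rounding overhead: {overhead:.2%}")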
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
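# Illustrative sketch (editor's addition, not part of the original patch):
# exercising the pinned-memory (host) allocator and reading the host
# statistics described above. Key names follow the docstring; availability
# may vary between builds, hence the .get() defaults.
import torch

if torch.cuda.is_available():
    buf = torch.empty(1 << 20, dtype=torch.uint8, pin_memory=True)  # 1 MiB pinned
    hstats = torch.cuda.host_memory_stats()
    print(hstats.get("allocated_bytes.current", 0))
    print(hstats.get("host_alloc_time.count", 0))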
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
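# Illustrative sketch (editor's addition, not part of the original patch):
# measuring the peak tensor memory of one step with reset_peak_memory_stats()
# and max_memory_allocated(), and printing the human-readable summary if an
# OOM occurs. `model` and `batch` are placeholders for whatever the training
# loop provides; assumes a CUDA device is present.
import torch

def profiled_step(model, batch):
    torch.cuda.reset_peak_memory_stats()
    try:
        loss = model(batch).sum()
        loss.backward()
    except torch.cuda.OutOfMemoryError:
        print(torch.cuda.memory_summary(abbreviated=True))
        raise
    peak_mib = torch.cuda.max_memory_allocated() / 2**20
    print(f"peak allocated this step: {peak_mib:.1f} MiB")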
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
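# Illustrative sketch (editor's addition, not part of the original patch):
# both reports are plain strings, so they can simply be logged. Note that
# list_gpu_processes() needs pynvml (or amdsmi on ROCm) installed, as the
# function body below explains.
import torch

if torch.cuda.is_available():
    print(torch.cuda.memory_summary(device=0, abbreviated=True))
    print(torch.cuda.list_gpu_processes(device=0))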
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
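# Illustrative sketch (editor's addition, not part of the original patch):
# mem_get_info() returns raw (free, total) byte counts from cudaMemGetInfo,
# from which a utilization figure can be derived. Assumes a CUDA device.
import torch

if torch.cuda.is_available():
    free, total = torch.cuda.mem_get_info(0)
    used_frac = 1 - free / total
    print(f"device 0: {free / 2**30:.2f} GiB free of {total / 2**30:.2f} GiB "
          f"({used_frac:.1%} in use)")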
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
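# Illustrative sketch (editor's addition, not part of the original patch):
# the typical workflow with the private memory-history API documented above.
# "oom_snapshot.pickle" is an arbitrary file name; the resulting file can be
# viewed at pytorch.org/memory_viz. Assumes a CUDA device is present.
import torch

if torch.cuda.is_available():
    torch.cuda.memory._record_memory_history(max_entries=100_000)
    workspace = [torch.randn(256, 256, device="cuda") for _ in range(8)]
    torch.cuda.memory._dump_snapshot("oom_snapshot.pickle")
    torch.cuda.memory._record_memory_history(enabled=None)  # stop recording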
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
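# Illustrative sketch (editor's addition, not part of the original patch):
# walking the segment/block structure described above via the public
# memory_snapshot() helper and summing the memory that is actually in use.
# Assumes a CUDA device is present.
import torch

if torch.cuda.is_available():
    active_bytes = 0
    for segment in torch.cuda.memory_snapshot():
        for block in segment["blocks"]:
            if block["state"] == "active_allocated":
                active_bytes += block["size"]
    print(f"active blocks hold {active_bytes / 2**20:.1f} MiB")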
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
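# Illustrative sketch (editor's addition, not part of the original patch):
# wiring up a pluggable allocator. "my_alloc.so", "my_malloc" and "my_free"
# are hypothetical names for a shared library you would have to compile
# yourself with the C signatures listed above; the swap must happen before
# any CUDA allocation is made.
import torch

new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
    "my_alloc.so", "my_malloc", "my_free"
)
torch.cuda.memory.change_current_allocator(new_alloc)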
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
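# Illustrative sketch (editor's addition, not part of the original patch):
# routing allocations into a private pool with the context manager defined
# above, then inspecting the pool. Assumes a CUDA device is present.
import torch

if torch.cuda.is_available():
    pool = torch.cuda.memory.MemPool()
    with torch.cuda.memory.use_mem_pool(pool):
        scratch = torch.empty(4 * 1024 * 1024, dtype=torch.uint8, device="cuda")
    print(pool.id, pool.use_count())
    print(len(pool.snapshot()))  # segments currently backing this pool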
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition to keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currently allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory.
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more than one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # currently cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc', # memory allocated + 'free_requested', # the allocation received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator asked cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda, possibly trying to free up memory to + # allocate more segments or because empty_cache was called + 'oom', # the allocator threw an OOM exception.
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx
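As a quick illustration (not part of the diff above), here is a minimal usage sketch of the allocator-introspection APIs the added file defines. It assumes a CUDA-enabled PyTorch build with at least one visible device, and that the module is importable as torch.cuda.memory with its public names re-exported under torch.cuda, as in upstream PyTorch; the underscore-prefixed helpers (_record_memory_history, _dump_snapshot) are not in __all__ and are shown only for illustration.

import torch

# Optionally record alloc/free history so snapshots carry stack traces.
torch.cuda.memory._record_memory_history(enabled="all")

x = torch.empty(1024, 1024, device="cuda")

print(torch.cuda.memory_allocated())   # bytes currently occupied by tensors
print(torch.cuda.memory_reserved())    # bytes held by the caching allocator
print(torch.cuda.memory_summary(abbreviated=True))

free_bytes, total_bytes = torch.cuda.mem_get_info()  # device-wide view via cudaMemGetInfo
print(f"{free_bytes} of {total_bytes} bytes free")

# Route allocations made inside the block to a dedicated pool.
pool = torch.cuda.MemPool()
with torch.cuda.use_mem_pool(pool):
    y = torch.empty(256, 256, device="cuda")
print(pool.use_count())

# Dump a snapshot that the viewer at pytorch.org/memory_viz can open.
torch.cuda.memory._dump_snapshot("dump_snapshot.pickle")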
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
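+
+    Example (a minimal sketch; assumes a CUDA build with at least one visible
+    device, and the reported numbers depend entirely on the running workload)::
+
+        >>> import torch
+        >>> _ = torch.empty((1024, 1024), device="cuda")
+        >>> print(torch.cuda.memory_summary(abbreviated=True))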
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
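+
+    Example (a minimal sketch; assumes at least one CUDA device is visible and
+    the resulting numbers are purely illustrative)::
+
+        >>> import torch
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> free_gib = free_bytes / 2**30  # bytes -> GiB, just for readability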
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish
+                                                    # using this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly to free up memory for more
+                                 # segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot,
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object.
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
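+
+    Example (a minimal sketch; assumes CUDA is available and uses the default
+    allocator configuration)::
+
+        >>> import torch
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # allocation routed to ``pool``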
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
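+
+
+# A minimal end-to-end sketch (assuming a CUDA build of PyTorch; `run_workload`
+# is a placeholder for the code being profiled): record an allocation history,
+# dump a snapshot, then stop recording. The resulting pickle can be opened with
+# the interactive viewer at pytorch.org/memory_viz.
+#
+#     torch.cuda.memory._record_memory_history(max_entries=100_000)
+#     run_workload()
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")
+#     torch.cuda.memory._record_memory_history(enabled=None)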
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
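+
+
+# ---------------------------------------------------------------------------
+# Illustrative usage sketches (editor-added, hypothetical helpers).
+# The `_example_*` functions below are NOT part of the torch.cuda.memory API;
+# they are minimal sketches, assuming a CUDA-enabled PyTorch build with at
+# least one visible GPU, added only to show how the functions defined above
+# are typically driven.
+
+
+def _example_record_history_and_dump(path: str = "dump_snapshot.pickle") -> None:
+    # Sketch: enable alloc/free history recording, run a tiny workload, then
+    # persist the snapshot so it can be loaded in the pytorch.org/memory_viz
+    # viewer; finally turn recording back off.
+    torch.cuda.memory._record_memory_history(enabled="all")
+    x = torch.randn(1024, 1024, device="cuda")
+    y = x @ x  # a few allocations so the recorded trace is non-empty
+    del x, y
+    torch.cuda.memory._dump_snapshot(path)
+    torch.cuda.memory._record_memory_history(enabled=None)
+
+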
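+def _example_inspect_memory_stats() -> None:
+    # Sketch (hypothetical helper): allocate a tensor and compare allocated vs.
+    # reserved bytes; memory_reserved() is typically >= memory_allocated()
+    # because the caching allocator keeps freed blocks cached for reuse.
+    # memory_summary() prints the same counters in a human-readable table.
+    t = torch.empty(64, 1024, 1024, dtype=torch.uint8, device="cuda")  # ~64 MiB
+    print("allocated bytes:", torch.cuda.memory_allocated())
+    print("reserved bytes: ", torch.cuda.memory_reserved())
+    print(torch.cuda.memory_summary(abbreviated=True))
+    del t
+    torch.cuda.empty_cache()
+
+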
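+def _example_cap_process_memory(fraction: float = 0.5) -> None:
+    # Sketch (hypothetical helper): cap this process at roughly half of the
+    # device's total memory; allocations that would exceed the cap raise an
+    # out-of-memory error instead of consuming more of the GPU.
+    torch.cuda.set_per_process_memory_fraction(fraction)
+    print("current fraction:", torch.cuda.get_per_process_memory_fraction())
+
+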
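+def _example_raw_allocation_roundtrip(nbytes: int = 1 << 20) -> None:
+    # Sketch (hypothetical helper): request a raw device pointer from the
+    # caching allocator (useful when handing memory to another framework) and
+    # free it again, checking the device-wide free/total numbers reported by
+    # cudaMemGetInfo via mem_get_info().
+    free_before, total = torch.cuda.mem_get_info()
+    ptr = torch.cuda.caching_allocator_alloc(nbytes)
+    torch.cuda.caching_allocator_delete(ptr)
+    free_after, _ = torch.cuda.mem_get_info()
+    print(f"total={total} free_before={free_before} free_after={free_after}")
+
+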
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
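+
+# A minimal sketch of how the statistics helpers in this module are typically
+# read; it assumes a CUDA-capable build with at least one visible device.
+def _example_memory_stats_usage():
+    if not torch.cuda.is_available():
+        return
+    x = torch.empty(1024, 1024, device="cuda")  # allocates ~4 MiB of float32
+    # Current and peak tensor memory, in bytes, on the current device.
+    print(torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated())
+    # Memory held by the caching allocator (can exceed memory_allocated()).
+    print(torch.cuda.memory_reserved())
+    # Individual counters from the flat stats dict described above.
+    stats = torch.cuda.memory_stats()
+    print(stats.get("num_alloc_retries", 0), stats.get("allocated_bytes.all.peak", 0))
+    # Device-wide free/total memory as reported by cudaMemGetInfo.
+    free, total = torch.cuda.mem_get_info()
+    print(f"{free} bytes free of {total}")
+    # Human-readable table of the same counters.
+    print(torch.cuda.memory_summary(abbreviated=True))
+    del x
+    # Reset the peak counters so the next measurement starts fresh.
+    torch.cuda.reset_peak_memory_stats()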
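+
+# A minimal sketch of the (private) memory-history workflow documented above;
+# the tensor allocations below are only a stand-in for the code being profiled.
+def _example_record_memory_history(out_file="snapshot.pickle"):
+    if not torch.cuda.is_available():
+        return
+    # Start recording stack traces and alloc/free events.
+    _record_memory_history("all")
+    try:
+        buffers = [torch.randn(256, 256, device="cuda") for _ in range(8)]
+        del buffers
+    finally:
+        # Write a pickle that the pytorch.org/memory_viz viewer can open,
+        # then stop recording.
+        _dump_snapshot(out_file)
+        _record_memory_history(None)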
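+
+# A minimal sketch of routing allocations into a MemPool with the context
+# manager defined above; assumes a CUDA device is available.
+def _example_mem_pool_usage():
+    if not torch.cuda.is_available():
+        return
+    pool = MemPool()  # pool backed by the current CUDACachingAllocator config
+    with use_mem_pool(pool):
+        # Tensors created here draw their memory from `pool`.
+        t = torch.ones(1024, device="cuda")
+    # Inspect the pool afterwards.
+    print(pool.id, pool.use_count())
+    print(len(pool.snapshot()))
+    del t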
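+
+# A minimal sketch of swapping in a pluggable allocator; the shared-library
+# path and the exported symbol names are placeholders for whatever a custom
+# allocator actually provides.
+def _example_pluggable_allocator(path_to_so="./my_alloc.so"):
+    # The .so must export two C functions with the signatures described in
+    # CUDAPluggableAllocator.__init__ above.
+    new_alloc = CUDAPluggableAllocator(path_to_so, "my_malloc", "my_free")
+    # Must run before the default allocator has been initialized/used.
+    change_current_allocator(new_alloc)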
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                        'active_awaiting_free', # waiting for another stream to finish using
+                                                # this, then it will become free
+                        'inactive',] # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested', # the allocator received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                              # able to be used in future allocation calls
+            'segment_alloc', # the caching allocator asks cudaMalloc for more memory
+                             # and added it as a segment in its cache
+            'segment_free', # the caching allocator called cudaFree to return memory
+                            # to cuda, possibly trying to free up memory to
+                            # allocate more segments or because empty_cache was called
+            'oom',          # the allocator threw an OOM exception. 'size' is
+                            # the requested number of bytes that did not succeed
+            'snapshot'      # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
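The MemPool / use_mem_pool API added above can be exercised as in the following short sketch. The sketch is illustrative only and is not part of the patch; it assumes a CUDA-capable PyTorch build in which this module is importable under its upstream name, torch.cuda.memory.

# Illustrative sketch only, not part of the patch. Assumes the module is
# available as torch.cuda.memory (as in upstream PyTorch) and a CUDA device.
import torch
from torch.cuda.memory import MemPool, use_mem_pool

if torch.cuda.is_available():
    pool = MemPool()  # pool backed by the current CUDACachingAllocator configuration
    with use_mem_pool(pool):
        x = torch.empty(1024, device="cuda")  # routed to `pool` while the context is active
    print(pool.use_count())      # reference count held on the pool
    print(len(pool.snapshot()))  # allocator segments reported while this pool is active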
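A second illustrative sketch, also not part of the patch, combines the statistics and snapshot helpers documented above (memory_summary, mem_get_info, _record_memory_history, _dump_snapshot); the same assumption about importing the module as torch.cuda.memory applies.

# Illustrative sketch only, not part of the patch. Assumes upstream torch.cuda.memory.
import torch

if torch.cuda.is_available():
    # Record stack traces for alloc/free events so snapshots can be attributed.
    torch.cuda.memory._record_memory_history(enabled="all", max_entries=100_000)

    a = torch.randn(4096, 4096, device="cuda")
    b = a @ a  # some work that allocates GPU memory

    print(torch.cuda.memory_summary(abbreviated=True))   # human-readable stats table
    free_bytes, total_bytes = torch.cuda.mem_get_info()  # raw cudaMemGetInfo numbers
    print(f"free={free_bytes} total={total_bytes}")

    # Dump a snapshot that can be opened in the viewer at pytorch.org/memory_viz.
    torch.cuda.memory._dump_snapshot("snapshot.pickle")
    torch.cuda.memory._record_memory_history(enabled=None)  # stop recording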
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream, this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream.
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
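+
+    Example (illustrative sketch; assumes a CUDA-capable device is available)::
+
+        >>> # Print a formatted table of allocator statistics for device 0.
+        >>> print(torch.cuda.memory_summary(device=0, abbreviated=True))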
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
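+
+    Example (illustrative sketch; assumes a CUDA-capable device is available)::
+
+        >>> # Query the driver-reported free and total device memory, in bytes.
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> used_fraction = 1 - free_bytes / total_bytes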
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a request to free this memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and adds it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    # Render segment usage of a snapshot with the ``_memory_viz`` helpers and save it.
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    # Render memory usage of a snapshot with the ``_memory_viz`` helpers and save it.
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
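+
+    Example (illustrative sketch; assumes a CUDA-capable device is available and that
+    these names are re-exported as ``torch.cuda.MemPool`` / ``torch.cuda.use_mem_pool``)::
+
+        >>> # Route allocations made inside the context manager to a dedicated pool.
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(64, device="cuda")  # allocation is routed to ``pool``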
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
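+
+
+# Illustrative usage sketch (comments only, not executed on import): one way the
+# helpers above can be combined to debug GPU memory. Assumes a CUDA-capable device;
+# the snapshot file name is arbitrary.
+#
+#     torch.cuda.memory._record_memory_history(enabled="all")
+#     ...  # run the workload under investigation
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")
+#     # open the pickle with the viewer at pytorch.org/memory_viz
+#     torch.cuda.memory._record_memory_history(enabled=None)  # stop recording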
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and adds it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
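A minimal usage sketch for the MemPool API defined above. It assumes these names are re-exported under `torch.cuda` (as the module's `__all__` suggests) and that a CUDA device is available; the tensor shape is arbitrary.

    import torch

    # Route allocations made inside the context manager into a dedicated pool.
    pool = torch.cuda.MemPool()  # no custom allocator: default CUDACachingAllocator behaviour

    with torch.cuda.use_mem_pool(pool):
        x = torch.empty(1024, 1024, device="cuda")  # backed by memory routed to `pool`

    print(pool.id)           # (int, int) identifier of the pool
    print(pool.use_count())  # reference count held on the pool
    segments = pool.snapshot()  # allocator snapshot taken with this pool active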
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
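+
+    Example (an illustrative sketch; the actual table and figures depend on the
+    device and workload, and ``abbreviated=True`` is just one possible setting)::
+
+        >>> # print an abbreviated memory summary for the current device
+        >>> print(torch.cuda.memory_summary(abbreviated=True))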
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
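+
+    Example (an illustrative sketch; the reported byte counts vary with the
+    device and with whatever else is currently using GPU memory)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> assert free_bytes <= total_bytes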
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot',  # the allocator generated a memory snapshot
+                         # useful to correlate a previously taken
+                         # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
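+
+    Example (an illustrative sketch; it assumes an initialized CUDA context and
+    that ``MemPool`` is re-exported as ``torch.cuda.MemPool``, as the references
+    above suggest)::
+
+        >>> pool = torch.cuda.MemPool()  # backed by the current/default allocator
+        >>> isinstance(pool.id, tuple)   # a pool is identified by a pair of ints
+        True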
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
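+
+
+# A minimal usage sketch for `MemPool` together with `use_mem_pool` (illustrative
+# only; it assumes a CUDA-capable device, and the tensor shape is arbitrary):
+#
+#     pool = MemPool()
+#     with use_mem_pool(pool):
+#         x = torch.randn(1024, device="cuda")  # this allocation is routed to `pool`
+#     # after the block, new allocations go back to the default pool again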
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
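+
+
+# Illustrative usage sketch (not part of the module above): how a caller might
+# route allocations into a dedicated pool with ``MemPool`` and ``use_mem_pool``.
+# This is a hedged example -- it assumes a PyTorch build that exports these
+# names from ``torch.cuda`` and a CUDA-capable device; the tensor size is an
+# arbitrary placeholder value.
+if __name__ == "__main__" and torch.cuda.is_available():
+    pool = torch.cuda.MemPool()  # pool backed by the default/current allocator
+    with torch.cuda.use_mem_pool(pool):
+        scratch = torch.empty(1024, device="cuda")  # served from `pool`
+    print("pool id:", pool.id, "use count:", pool.use_count())
+    del scratch  # release the block back to the pool before the pool goes away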
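+
+
+# Another illustrative sketch: the raw interop path documented above --
+# ``caching_allocator_alloc``/``caching_allocator_delete`` -- plus
+# ``mem_get_info`` for a quick free/total readout. Assumes a CUDA device;
+# the 1 MiB allocation size is an arbitrary example value.
+if __name__ == "__main__" and torch.cuda.is_available():
+    free_bytes, total_bytes = torch.cuda.mem_get_info()
+    print(f"free: {free_bytes / 2**20:.1f} MiB of {total_bytes / 2**20:.1f} MiB")
+    raw_ptr = torch.cuda.caching_allocator_alloc(1024 * 1024)  # 1 MiB on the current stream
+    torch.cuda.caching_allocator_delete(raw_ptr)  # hand it back to the caching allocator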
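+
+
+# Final illustrative sketch: the snapshot/debugging workflow documented above --
+# record allocator history, run a workload, dump a snapshot for the viewer at
+# pytorch.org/memory_viz, then print the human-readable summary. Assumes a CUDA
+# device; the matrix size and output file name are arbitrary example values.
+if __name__ == "__main__" and torch.cuda.is_available():
+    torch.cuda.memory._record_memory_history(max_entries=100_000)
+    x = torch.randn(2048, 2048, device="cuda")
+    y = x @ x  # allocations made here will show up in the recorded trace
+    torch.cuda.memory._dump_snapshot("example_snapshot.pickle")
+    torch.cuda.memory._record_memory_history(enabled=None)  # stop recording
+    print(torch.cuda.memory_summary(abbreviated=True))
+
+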
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit a caching allocator to allocated memory on a CUDA device.
+    The allowed value equals the total visible memory multiplied by the fraction.
+    If a process tries to allocate more than the allowed value, the allocator
+    will raise an out-of-memory error.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and becomes
+    visible in `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
+    """
+    if is_initialized():
+        torch._C._cuda_emptyCache()
+
+
+def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]:
+    r"""Return a dictionary of CUDA memory allocator statistics for a given device.
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
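+
+    A minimal sketch of that per-iteration measurement (``model`` and ``loader``
+    below are placeholder names, and a CUDA device is assumed to be available)::
+
+        >>> for inputs in loader:
+        ...     torch.cuda.reset_peak_memory_stats()
+        ...     out = model(inputs.cuda())
+        ...     print(torch.cuda.max_memory_reserved())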
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                              # and adds it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to CUDA, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache() was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
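+
+    A minimal usage sketch, assuming a CUDA device is available and that these
+    classes are re-exported as ``torch.cuda.MemPool`` / ``torch.cuda.use_mem_pool``
+    (the tensor shape is illustrative only)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.ones(4, device="cuda")  # served from ``pool``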
+
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
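+
+
+# The commented sketch below is illustrative only (it is not executed when this
+# module is imported): one possible way to combine the memory-history and
+# snapshot helpers defined above when debugging memory usage, assuming a CUDA
+# device is available.
+#
+#   torch.cuda.memory._record_memory_history(max_entries=100000)
+#   x = torch.randn(1024, 1024, device="cuda")  # workload under investigation
+#   torch.cuda.memory._dump_snapshot("dump_snapshot.pickle")
+#   torch.cuda.memory._record_memory_history(enabled=None)  # stop recording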
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc'  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
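+
+    A usage sketch (illustrative; the tensor shape is arbitrary, and
+    :func:`~torch.cuda.use_mem_pool` is the companion context manager defined
+    in this module)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # this allocation is routed to ``pool``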
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
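+
+    A minimal interop sketch (the 1 MiB size is arbitrary; the returned value is
+    a raw device pointer, not a tensor)::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024 * 1024)
+        >>> # ... hand ptr to another framework's API ...
+        >>> torch.cuda.caching_allocator_delete(ptr)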
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the caching allocator to the memory allocated on a CUDA device.
+    The allowed value equals the total visible memory multiplied by the fraction.
+    If a process tries to allocate more than the allowed value, the allocator will raise an
+    out-of-memory error.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and is visible in
+    `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
+    """
+    if is_initialized():
+        torch._C._cuda_emptyCache()
+
+
+def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]:
+    r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
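+
+    A sketch of per-iteration peak tracking (the training step itself is elided
+    and only illustrative)::
+
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> # ... run one training iteration ...
+        >>> peak_reserved = torch.cuda.max_memory_reserved()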
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc'  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
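+
+    A usage sketch (illustrative; the allocation size is arbitrary)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     scratch = torch.empty(1 << 20, device="cuda")  # served from ``pool``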
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If it is ``None`` then
+        the default stream for the selected device is used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the caching allocator to the memory allocated on a CUDA device.
+    The allowed value equals the total visible memory multiplied by the fraction.
+    If a process tries to allocate more than the allowed value, the allocator will raise an
+    out-of-memory error.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and is visible in
+    `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management. 
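+
+    A minimal sketch (the tensor size is arbitrary)::
+
+        >>> x = torch.empty(64 * 1024 * 1024, device="cuda")
+        >>> del x                      # the block goes back to the caching allocator
+        >>> torch.cuda.empty_cache()   # ... and is now released back to the driver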
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations.
+ # If the reuse is smaller than the segment, the segment
+ # is split into more than one Block.
+ # empty_cache() frees Segments that are entirely inactive.
+ address: int
+ total_size: int # cudaMalloc'd size of segment
+ stream: int
+ segment_type: Literal['small', 'large'] # 'large' (>1MB)
+ allocated_size: int # size of memory in use
+ active_size: int # size of memory in use or in active_awaiting_free state
+ blocks : List[Block]
+
+ class Block(TypedDict):
+ # A piece of memory returned from the allocator, or
+ # currently cached but inactive.
+ size: int
+ requested_size: int # size requested during malloc, may be smaller than
+ # size due to rounding
+ address: int
+ state: Literal['active_allocated', # used by a tensor
+ 'active_awaiting_free', # waiting for another stream to finish using
+ # this, then it will become free
+ 'inactive',] # free for reuse
+ frames: List[Frame] # stack trace from where the allocation occurred
+
+ class Frame(TypedDict):
+ filename: str
+ line: int
+ name: str
+
+ class TraceEntry(TypedDict):
+ # When `torch.cuda.memory._record_memory_history()` is enabled,
+ # the snapshot will contain TraceEntry objects that record each
+ # action the allocator took.
+ action: Literal[
+ 'alloc', # memory allocated
+ 'free_requested', # the allocator received a call to free memory
+ 'free_completed', # the memory that was requested to be freed is now
+ # able to be used in future allocation calls
+ 'segment_alloc', # the caching allocator asks cudaMalloc for more memory
+ # and adds it as a segment in its cache
+ 'segment_free', # the caching allocator called cudaFree to return memory
+ # to cuda, possibly trying to free up memory to
+ # allocate more segments or because empty_cache was called
+ 'oom', # the allocator threw an OOM exception. 'size' is
+ # the requested number of bytes that did not succeed
+ 'snapshot' # the allocator generated a memory snapshot
+ # useful to correlate a previously taken
+ # snapshot with this trace
+ ]
+ addr: int # not present for OOM
+ frames: List[Frame]
+ size: int
+ stream: int
+ device_free: int # only present for OOM, the amount of
+ # memory cuda still reports to be free
+
+ Returns:
+ The Snapshot dictionary object
+ """
+ return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+ """
+ Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+ This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+ Args:
+ filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+ """
+ s = _snapshot()
+ with open(filename, "wb") as f:
+ pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+ if snapshot is None:
+ snapshot = _snapshot()
+ with open(filename, "w") as f:
+ f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+ if snapshot is None:
+ snapshot = _snapshot()
+ with open(filename, "w") as f:
+ f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+ return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+ r"""Return a string describing the active allocator backend as set by
+ ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+ ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+ (CUDA's built-in asynchronous allocator).
+
+ .. note::
+ See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+ """
+ return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+ r"""Wrapper over internal CUDA memory allocators."""
+
+ def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+ self._allocator = allocator
+
+ def allocator(self):
+ return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+ r"""CUDA memory allocator loaded from a .so file."""
+
+ def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+ r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+ To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+ Args:
+ path_to_so_file(str): Path in the filesystem to the `.so` file containing
+ the allocator functions
+ alloc_fn_name(str): Name of the function to perform the memory allocation
+ in the .so file. The signature must be:
+ void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+ free_fn_name(str): Name of the function to perform the memory release
+ in the .so file. The signature must be:
+ void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+ .. warning::
+ This is currently supported only on UNIX OSes.
+
+ .. note::
+ See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+ """
+ allocator = ctypes.CDLL(path_to_so_file)
+ alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+ free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+ assert alloc_fn is not None
+ assert free_fn is not None
+ self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+ r"""Change the currently used memory allocator to be the one provided.
+
+ If the current allocator has already been used/initialized, this function will error.
+
+ Args:
+ allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+ .. note::
+ See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+ """
+ torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+ r"""Return the allocator being currently used.
+
+ .. note::
+ See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+ """
+ return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+ r"""MemPoolContext holds the currently active pool and stashes the previous
+ pool. On deletion it makes the previous pool active.
+
+ Args:
+ pool(torch.cuda.MemPool): a MemPool object to be made active so that
+ allocations route to this pool.
+
+ """
+
+ def __init__(self, pool: _MemPool):
+ super().__init__(pool)
+
+ @staticmethod
+ def active_pool() -> Optional[_MemPool]:
+ r"""Returns the active MemPool"""
+ return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+ r"""MemPool represents a pool of memory in a caching allocator. Currently,
+ it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+ Args:
+ allocator(torch._C._cuda_CUDAAllocator, optional): a
+ torch._C._cuda_CUDAAllocator object that can be used to
+ define how memory gets allocated in the pool. If :attr:`allocator`
+ is ``None`` (default), memory allocation follows the default/
+ current configuration of the CUDACachingAllocator.
+
+ """
+
+ def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+ super().__init__(allocator, True)
+
+ @property
+ def id(self) -> tuple[int, int]:
+ r"""Returns the ID of this pool as a tuple of two ints."""
+ return super().id
+
+ @property
+ def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+ r"""Returns the allocator this MemPool routes allocations to."""
+ return super().allocator
+
+ def use_count(self) -> int:
+ r"""Returns the reference count of this pool."""
+ return super().use_count()
+
+ def snapshot(self):
+ r"""Return a snapshot of the CUDA memory allocator pool state across all
+ devices.
+
+ Interpreting the output of this function requires familiarity with the
+ memory allocator internals.
+
+ .. note::
+ See :ref:`cuda-memory-management` for more details about GPU memory
+ management.
+ """
+ try:
+ ctx = MemPoolContext(self)
+ snapshot = torch.cuda.memory_snapshot()
+ finally:
+ del ctx
+ return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+ r"""A context manager that routes allocations to a given pool.
+
+ Args:
+ pool(torch.cuda.MemPool): a MemPool object to be made active so that
+ allocations route to this pool.
+ device (torch.device or int, optional): selected device. Uses MemPool on
+ the current device, given by :func:`~torch.cuda.current_device`,
+ if :attr:`device` is ``None`` (default).
+
+ """
+ ctx = MemPoolContext(pool)
+ device_index = (
+ torch.cuda.current_device() if device is None else _get_device_index(device)
+ )
+ _cuda_beginAllocateToPool(device_index, pool.id)
+ try:
+ yield
+ finally:
+ _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+ _cuda_releasePool(device_index, pool.id)
+ del ctx
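
For reference, a minimal usage sketch of the introspection helpers defined above, assuming a CUDA-enabled PyTorch build where this module is importable as ``torch.cuda.memory``. The tensor shape and the snapshot file name are illustrative only, and ``_record_memory_history`` / ``_dump_snapshot`` are the private helpers documented in the module.

.. code-block:: python

    import torch

    if torch.cuda.is_available():
        device = torch.cuda.current_device()

        # Optionally record allocation history so snapshots carry stack traces.
        torch.cuda.memory._record_memory_history(max_entries=100_000)

        x = torch.empty(64, 1024, 1024, device=device)  # ~256 MiB of float32

        print(torch.cuda.memory_allocated(device))      # bytes currently held by tensors
        print(torch.cuda.max_memory_allocated(device))  # peak since start / last reset
        print(torch.cuda.memory_reserved(device))       # bytes cached by the allocator
        free_b, total_b = torch.cuda.mem_get_info(device)  # device-wide cudaMemGetInfo view
        print(torch.cuda.memory_summary(device, abbreviated=True))

        del x
        torch.cuda.empty_cache()                    # return unused cached blocks to CUDA
        torch.cuda.reset_peak_memory_stats(device)  # restart peak tracking

        # Save a snapshot for the viewer at pytorch.org/memory_viz, then stop recording.
        torch.cuda.memory._dump_snapshot("memory_snapshot.pickle")
        torch.cuda.memory._record_memory_history(None)

The same counters are available in raw form through ``memory_stats()``; for example, ``memory_stats(device)["allocated_bytes.all.peak"]`` is the value returned by ``max_memory_allocated()``.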
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes the amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator currently in use.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        # Create the context before entering `try` so `del ctx` in `finally`
+        # cannot raise NameError if construction fails.
+        ctx = MemPoolContext(self)
+        try:
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
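+
+# Illustrative sketch of capturing an allocator snapshot with recorded stack
+# traces, using the private helpers documented earlier in this module;
+# `run_workload` is a hypothetical placeholder, and the resulting pickle can be
+# viewed at pytorch.org/memory_viz:
+#
+#     torch.cuda.memory._record_memory_history(max_entries=100000)
+#     run_workload()
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")
+#     torch.cuda.memory._record_memory_history(enabled=None)   # stop recording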
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream, this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream.
If it is ``None`` then
+            the default stream for the selected device is used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the memory that the caching allocator can allocate
+    on a CUDA device. The allowed value equals the total visible memory multiplied by
+    the fraction. If a process tries to allocate more than the allowed value, the
+    allocator will raise an out-of-memory error.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and is visible in
+    `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
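+
+    Example (illustrative only; prints a compact summary for the current
+    device, e.g. from inside an out-of-memory handler)::
+
+        import torch
+
+        print(torch.cuda.memory_summary(abbreviated=True))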
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
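+
+    Example (a small sketch; assumes at least one visible CUDA device)::
+
+        import torch
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"{free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB")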
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot,
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
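+
+    A minimal, hedged usage sketch (assumes a CUDA-enabled build and that this
+    module's public names are re-exported as ``torch.cuda.MemPool`` and
+    ``torch.cuda.use_mem_pool``; the tensor size is arbitrary)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            buf = torch.empty(1024, device="cuda")  # allocation routed to ``pool``
+        print(pool.use_count())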
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
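+
+
+# --- Editorial example (not part of the upstream torch.cuda.memory API) -----
+# A hedged, self-contained sketch of how the helpers above are commonly
+# combined when debugging memory usage.  The filename and tensor size are
+# illustrative; the block is guarded so importing the module never runs it
+# and never requires a GPU.
+if __name__ == "__main__" and torch.cuda.is_available():
+    _record_memory_history("all")                  # record alloc/free events
+    scratch = torch.empty(1 << 20, device="cuda")  # an arbitrary allocation
+    _dump_snapshot("example_snapshot.pickle")      # inspect at pytorch.org/memory_viz
+    print(memory_summary(abbreviated=True))        # human-readable stats table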
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
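+
+    A minimal usage sketch, assuming a CUDA build with at least one visible
+    device and that ``MemPool``/``use_mem_pool`` are re-exported at the
+    ``torch.cuda`` level as listed in ``__all__`` (the tensor shape is
+    illustrative only):
+
+    .. code-block:: python
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # Allocations made inside the context are routed to ``pool``.
+            x = torch.empty(1024, device="cuda")
+        # Subsequent allocations go back to the default pool.
+        y = torch.empty(1024, device="cuda")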
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
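+
+    The per-iteration pattern mentioned above looks roughly like the following
+    (a minimal sketch; ``loader`` and ``step`` are placeholders for a data
+    loader and a training step, and a CUDA device is assumed)::
+
+        for batch in loader:
+            torch.cuda.reset_peak_memory_stats()
+            step(batch)
+            peak_bytes = torch.cuda.max_memory_reserved()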
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
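+
+
+# Illustrative sketch: a typical peak-tracking pattern built from the public
+# ``torch.cuda`` wrappers defined above. The name ``_example_peak_tracking``
+# is hypothetical, and a visible CUDA device is assumed when it is called.
+def _example_peak_tracking() -> None:
+    torch.cuda.reset_peak_memory_stats()
+    x = torch.empty(1024, 1024, device="cuda")  # ~4 MiB of float32 scratch data
+    print(f"allocated: {torch.cuda.memory_allocated()} bytes")
+    print(f"peak:      {torch.cuda.max_memory_allocated()} bytes")
+    del x
+    torch.cuda.empty_cache()  # hand unused cached segments back to the driver
+    print(torch.cuda.memory_summary(abbreviated=True))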
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocator received a request to free this memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
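
The `_save_segment_usage` and `_save_memory_usage` helpers above wrap the `_memory_viz` tools. A hedged sketch of rendering a captured snapshot to SVG follows; it assumes history recording was enabled as in the previous example and the filenames are arbitrary.

.. code-block:: python

    import torch

    # Capture the current allocator state (segments plus, if recording is
    # enabled, the per-device trace of alloc/free events).
    snap = torch.cuda.memory._snapshot()

    # Render two views of the same snapshot; filenames are placeholders.
    torch.cuda.memory._save_segment_usage("segments.svg", snap)
    torch.cuda.memory._save_memory_usage("memory.svg", snap)
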
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function that performs the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function that performs the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
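
Putting `CUDAPluggableAllocator` and `change_current_allocator` together, a minimal sketch looks like the following. The library name ``alloc.so`` and the symbols ``my_malloc``/``my_free`` are hypothetical; the shared object must export functions with the signatures documented above, and the swap must happen before the default allocator has made its first allocation.

.. code-block:: python

    import torch

    # Hypothetical shared object exporting my_malloc/my_free with the
    # documented signatures.
    new_alloc = torch.cuda.memory.CUDAPluggableAllocator(
        "alloc.so", "my_malloc", "my_free"
    )

    # Make it the active allocator; this raises if the native caching
    # allocator has already been initialized.
    torch.cuda.memory.change_current_allocator(new_alloc)

    # Subsequent CUDA allocations are served by the pluggable allocator.
    x = torch.zeros(10, device="cuda")
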
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
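
As a closing illustration of the pool API defined above, here is a minimal sketch, assuming a CUDA device is available; the tensor shape is arbitrary.

.. code-block:: python

    import torch

    # A pool backed by the default allocator configuration.
    pool = torch.cuda.memory.MemPool()

    # Allocations made inside the context are routed to `pool` on the
    # current device instead of the global caching allocator.
    with torch.cuda.memory.use_mem_pool(pool):
        x = torch.empty(64, 1024, 1024, device="cuda")

    # `id` is a tuple of two ints identifying the pool; `use_count` is the
    # pool's reference count.
    print(pool.id, pool.use_count())
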
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
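+
+    A short usage sketch (the device index and call site are illustrative only):
+
+    .. code-block:: python
+
+        import torch
+
+        # e.g. print an abbreviated report once per epoch, or when catching
+        # an out-of-memory error
+        print(torch.cuda.memory_summary(device=0, abbreviated=True))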
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
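+
+    A small sketch of how the return value is typically consumed (the 10%
+    threshold is an arbitrary example):
+
+    .. code-block:: python
+
+        import torch
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()  # current device
+        if free_bytes / total_bytes < 0.1:
+            print("less than 10% of device memory is free")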
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                           'active_awaiting_free', # waiting for another stream to finish using
+                                                   # this, then it will become free
+                           'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested', # the allocator received a call to free memory
+                'free_completed', # the memory that was requested to be freed is now
+                                  # able to be used in future allocation calls
+                'segment_alloc', # the caching allocator asked cudaMalloc for more memory
+                                 # and added it as a segment in its cache
+                'segment_free', # the caching allocator called cudaFree to return memory
+                                # to CUDA, possibly trying to free up memory to
+                                # allocate more segments or because empty_cache was called
+                'oom', # the allocator threw an OOM exception. 'size' is
+                       # the requested number of bytes that did not succeed
+                'snapshot', # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
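+
+    A minimal usage sketch (shapes are arbitrary; it assumes the module is
+    exposed as ``torch.cuda``, as the rest of this file does):
+
+    .. code-block:: python
+
+        import torch
+
+        pool = torch.cuda.MemPool()                  # pool backed by the current allocator
+        with torch.cuda.use_mem_pool(pool):
+            x = torch.empty(1024, device="cuda")     # routed to ``pool``
+        y = torch.empty(1024, device="cuda")         # back to the default pool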
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
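+
+    A minimal interoperability sketch (the size is arbitrary; the returned value
+    is a raw device pointer, not a tensor):
+
+    .. code-block:: python
+
+        import torch
+
+        ptr = torch.cuda.caching_allocator_alloc(1024)  # 1 KiB on the current device/stream
+        # ... pass ``ptr`` to an external library that expects a device pointer ...
+        torch.cuda.caching_allocator_delete(ptr)        # return the block to the caching allocator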
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
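+
+    A sketch of the per-iteration measurement described above (``loader`` and
+    ``step`` are placeholders for user code):
+
+    .. code-block:: python
+
+        import torch
+
+        for batch in loader:
+            torch.cuda.reset_peak_memory_stats()
+            step(batch)                              # user training step
+            peak = torch.cuda.max_memory_reserved()  # peak reserved bytes this iteration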
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
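+
+
+# ---------------------------------------------------------------------------
+# Editor's illustrative sketch (not part of the upstream module): the helper
+# below shows one way the introspection APIs defined above might be exercised
+# together. It assumes a CUDA build of torch with at least one visible device,
+# is never called on import, and uses an arbitrary tensor size and file name.
+def _example_memory_debugging(filename: str = "snapshot.pickle") -> None:
+    if not torch.cuda.is_available():
+        return
+    _record_memory_history(enabled="all")        # record alloc/free events with stack traces
+    x = torch.randn(4096, 4096, device="cuda")   # arbitrary workload: ~64 MiB of float32
+    print(memory_allocated(), "bytes currently held by tensors")
+    print(memory_summary(abbreviated=True))      # human-readable allocator statistics
+    _dump_snapshot(filename)                     # open the file at pytorch.org/memory_viz
+    _record_memory_history(enabled=None)         # stop recording history
+    del x
+    empty_cache()                                # return cached, unused blocks to the driver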
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                        'active_awaiting_free',  # waiting for another stream to finish using
+                                                 # this, then it will become free
+                        'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocation received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                              # and added it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
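+
+    A minimal usage sketch (illustrative; assumes a CUDA device is available)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(64, device="cuda")  # this allocation is routed to `pool`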
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
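+
+    A minimal usage sketch (illustrative; how much memory is actually released
+    depends on the workload)::
+
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> del x                     # the block goes back to the caching allocator
+        >>> torch.cuda.empty_cache()  # unoccupied cached blocks are returned to the driver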
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
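+
+    A minimal usage sketch (illustrative)::
+
+        >>> print(torch.cuda.memory_summary())                  # full per-pool table
+        >>> print(torch.cuda.memory_summary(abbreviated=True))  # totals only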
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
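+
+    A minimal usage sketch (illustrative; the reported values are device- and
+    workload-dependent)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> used_fraction = 1.0 - free_bytes / total_bytes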
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
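+
+    A minimal usage sketch (illustrative; allocations are routed into the pool via
+    :func:`~torch.cuda.use_mem_pool`)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> pool.id           # tuple of two ints identifying the pool
+        >>> pool.use_count()  # reference count held on the pool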
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
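+
+    A minimal usage sketch (illustrative; the returned value is a raw device pointer
+    that must be freed explicitly)::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)  # 1 KiB on the current device/stream
+        >>> torch.cuda.caching_allocator_delete(ptr)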
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+
    """
    ctx = MemPoolContext(pool)
    device_index = (
        torch.cuda.current_device() if device is None else _get_device_index(device)
    )
    _cuda_beginAllocateToPool(device_index, pool.id)
    try:
        yield
    finally:
        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
        _cuda_releasePool(device_index, pool.id)
        del ctx
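
The statistics helpers documented above lend themselves to a short worked example. The block below is an illustrative sketch, not part of the patched module: it assumes a CUDA-enabled torch build and only uses the helpers described in this file (memory_stats, reset_peak_memory_stats, memory_summary, mem_get_info).

import torch


def report_iteration_memory(step: int) -> None:
    """Print per-iteration allocator statistics for the current device."""
    stats = torch.cuda.memory_stats()
    # Keys follow the "<stat>.<pool>.<metric>" layout described above,
    # e.g. "allocated_bytes.all.peak".
    current = stats.get("allocated_bytes.all.current", 0)
    peak = stats.get("allocated_bytes.all.peak", 0)
    reserved = stats.get("reserved_bytes.all.current", 0)
    print(f"step {step}: current={current} B, peak={peak} B, reserved={reserved} B")
    # Reset the peak counters so the next iteration's peak is measured in isolation.
    torch.cuda.reset_peak_memory_stats()


if torch.cuda.is_available():
    x = torch.randn(1024, 1024, device="cuda")
    y = x @ x
    report_iteration_memory(step=0)
    # memory_summary() renders the same statistics as a human-readable table.
    print(torch.cuda.memory_summary(abbreviated=True))
    # mem_get_info() reports driver-level free/total memory via cudaMemGetInfo.
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    print(f"device reports {free_bytes} free of {total_bytes} total bytes")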
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocated block received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-based OSs.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
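+
+
+# --- Illustrative usage sketch (editor's addition, not part of the upstream
+# module): shows how a caller might route allocations into a private pool with
+# the `MemPool` class and `use_mem_pool` context manager defined above. The
+# function name `_example_mem_pool_usage` is hypothetical.
+def _example_mem_pool_usage():
+    if not torch.cuda.is_available():
+        return None
+    pool = MemPool()  # pool backed by the default CUDACachingAllocator config
+    with use_mem_pool(pool):
+        # Tensors created here have their memory served from `pool`.
+        x = torch.empty(1024, device="cuda")
+    # Outside the context, allocations go back to the regular allocator.
+    y = torch.empty(1024, device="cuda")
+    return x, y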
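+
+
+# --- Illustrative usage sketch (editor's addition): measuring the peak memory
+# of a single step with `memory_allocated`, `max_memory_allocated` and
+# `reset_peak_memory_stats` from this module. `_example_peak_tracking` and the
+# workload inside it are hypothetical.
+def _example_peak_tracking():
+    if not torch.cuda.is_available():
+        return None
+    reset_peak_memory_stats()
+    before = memory_allocated()
+    tmp = torch.randn(1024, 1024, device="cuda")  # hypothetical workload
+    peak = max_memory_allocated()
+    del tmp
+    # `peak - before` approximates the extra memory this step needed.
+    return peak - before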
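+
+
+# --- Illustrative usage sketch (editor's addition): reading a few of the
+# flattened `memory_stats` keys and printing the `memory_summary` table. The
+# key names are the ones documented in `memory_stats` above.
+def _example_stats_report():
+    if not torch.cuda.is_available():
+        return
+    stats = memory_stats()
+    reserved = stats.get("reserved_bytes.all.current", 0)
+    retries = stats.get("num_alloc_retries", 0)
+    print(f"reserved={reserved} bytes, cudaMalloc retries={retries}")
+    print(memory_summary(abbreviated=True))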
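+
+
+# --- Illustrative usage sketch (editor's addition): using `mem_get_info` and
+# `list_gpu_processes` when an out-of-memory error is caught, as their
+# docstrings suggest. The allocation size is deliberately unrealistic.
+def _example_oom_report():
+    if not torch.cuda.is_available():
+        return
+    try:
+        _ = torch.empty(1 << 40, device="cuda")  # deliberately too large
+    except torch.cuda.OutOfMemoryError:
+        free, total = mem_get_info()
+        print(f"free={free} / total={total} bytes on the current device")
+        print(list_gpu_processes())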
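+
+
+# --- Illustrative usage sketch (editor's addition): recording allocator history
+# with the private helpers `_record_memory_history` and `_dump_snapshot` defined
+# above, then writing a pickle that can be loaded in the pytorch.org/memory_viz
+# viewer. The output filename is hypothetical.
+def _example_snapshot_capture(filename="memory_snapshot.pickle"):
+    if not torch.cuda.is_available():
+        return
+    _record_memory_history(enabled="all")
+    work = torch.randn(4096, 4096, device="cuda")  # hypothetical workload
+    del work
+    _dump_snapshot(filename)
+    _record_memory_history(enabled=None)  # stop recording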
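+
+
+# --- Illustrative usage sketch (editor's addition): raw interop allocation via
+# `caching_allocator_alloc` / `caching_allocator_delete`, e.g. to hand a device
+# buffer to another framework. The 1 MiB size is arbitrary.
+def _example_raw_allocation():
+    if not torch.cuda.is_available():
+        return
+    ptr = caching_allocator_alloc(1024 * 1024)  # bytes, on the current device/stream
+    try:
+        pass  # another library could use the raw device pointer here
+    finally:
+        caching_allocator_delete(ptr)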
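+
+
+# --- Illustrative usage sketch (editor's addition): loading a custom allocator
+# with `CUDAPluggableAllocator` and installing it via `change_current_allocator`.
+# The shared-object path "./my_alloc.so" and the exported symbol names
+# "my_malloc" / "my_free" are hypothetical; they must point at a real library
+# compiled with the C signatures documented in CUDAPluggableAllocator above.
+# Note that switching allocators errors if the current one has already been used.
+def _example_pluggable_allocator(path="./my_alloc.so"):
+    custom = CUDAPluggableAllocator(path, "my_malloc", "my_free")
+    change_current_allocator(custom)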
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
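+
+    A minimal usage sketch (assuming a CUDA device is available)::
+
+        >>> print(torch.cuda.memory_summary(abbreviated=True))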
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
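+
+    A minimal usage sketch (assuming a CUDA device is available; the formatting
+    below is only illustrative)::
+
+        >>> free, total = torch.cuda.mem_get_info()
+        >>> print(f"{free / 1024**3:.2f} GiB free of {total / 1024**3:.2f} GiB total")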
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc'  # memory allocated
+                'free_requested',  # the allocator received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and adds it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
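+
+    A minimal usage sketch (assuming a CUDA device is available and that
+    ``MemPool`` and ``use_mem_pool`` are re-exported under ``torch.cuda``)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(64, device="cuda")  # this allocation is routed to ``pool``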
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
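+
+
+# The helpers below are illustrative sketches only: their names are hypothetical
+# and they are not part of the public API of this module. They combine functions
+# defined above and assume a CUDA-capable build.
+def _example_measure_peak_allocated(fn, *args, device: Union[Device, int] = None, **kwargs):
+    """Run ``fn`` and return ``(result, peak_allocated_bytes)`` for ``device``.
+
+    A minimal sketch: it resets the peak counters first, so it also clears the
+    peak statistics seen by any other code in the process.
+    """
+    reset_peak_memory_stats(device=device)
+    result = fn(*args, **kwargs)
+    torch.cuda.synchronize(device)  # make sure queued kernels have finished
+    return result, max_memory_allocated(device=device)
+
+
+def _example_dump_alloc_history(path: str = "snapshot.pickle") -> None:
+    """Record allocator history around a tiny workload and dump a snapshot.
+
+    A minimal sketch of the ``_record_memory_history`` / ``_dump_snapshot``
+    workflow; ``path`` is an illustrative output location for the pickle that
+    can be loaded into the viewer at pytorch.org/memory_viz.
+    """
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    try:
+        x = torch.empty(1024, 1024, device="cuda")  # stand-in for a real workload
+        del x
+        _dump_snapshot(path)
+    finally:
+        _record_memory_history(enabled=None)  # stop recording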
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes the amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-based operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
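+
+
+# The following is an illustrative, hedged usage sketch rather than part of the
+# allocator API itself: it shows how the inspection helpers defined above
+# (memory_allocated, memory_summary, _record_memory_history, _dump_snapshot,
+# empty_cache) might be exercised together. It assumes a CUDA-capable build and
+# at least one visible device; the tensor sizes and the snapshot file name are
+# arbitrary placeholders.
+def _example_memory_debugging(snapshot_file: str = "example_snapshot.pickle") -> None:
+    if not torch.cuda.is_available():
+        return
+    # Record alloc/free history so the snapshot below carries stack traces.
+    _record_memory_history(enabled="all")
+    # Perform a few allocations so the statistics are non-trivial.
+    scratch = [torch.randn(1024, 1024, device="cuda") for _ in range(4)]
+    print("allocated bytes:", memory_allocated())
+    print(memory_summary(abbreviated=True))
+    # Persist a snapshot that can be inspected at pytorch.org/memory_viz.
+    _dump_snapshot(snapshot_file)
+    del scratch
+    empty_cache()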
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
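+
+    Example (an illustrative sketch, not a fixed output; assumes at least one
+    visible CUDA device -- the table contents depend on the allocator state)::
+
+        >>> # xdoctest: +SKIP
+        >>> # Print the abbreviated per-device summary table.
+        >>> print(torch.cuda.memory_summary(abbreviated=True))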
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
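+
+    Example (an illustrative sketch; assumes at least one visible CUDA device)::
+
+        >>> # xdoctest: +SKIP
+        >>> # cudaMemGetInfo reports device-wide numbers, not just this process.
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes / 1024**3:.2f} GiB free / {total_bytes / 1024**3:.2f} GiB total")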
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # a call was made to free this memory
+                'free_completed',  # the memory that was requested to be freed can now
+                                   # be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly to free up memory for
+                                 # more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory CUDA still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
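+
+    Example (an illustrative sketch; assumes a CUDA-enabled build with at least
+    one visible device, and that this module is exposed as ``torch.cuda``)::
+
+        >>> # xdoctest: +SKIP
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     buf = torch.empty(1024, device="cuda")  # allocation routed to `pool`
+        >>> print(pool.use_count())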
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
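+
+
+# Example (an illustrative sketch, kept as a comment so importing this module
+# stays side-effect free): reading a few of the flattened statistic keys that
+# the docstrings above describe; key names follow memory_stats() and
+# host_memory_stats(), and .get() is used in case a key is absent on a build.
+#
+#   stats = torch.cuda.memory_stats()
+#   print(stats.get("allocated_bytes.all.peak", 0), stats.get("num_alloc_retries", 0))
+#
+#   host_stats = torch.cuda.host_memory_stats()
+#   print(host_stats.get("host_alloc_time.total", 0))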
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
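+
+        A minimal usage sketch, assuming the standard re-exports under ``torch.cuda``
+        (the tensor shape is arbitrary):
+
+        .. code-block:: python
+
+            pool = torch.cuda.MemPool()
+            with torch.cuda.use_mem_pool(pool):
+                # Allocations made inside this block are routed to ``pool``.
+                x = torch.empty(1024, device="cuda")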
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
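+
+    A small illustration (the tensor size is arbitrary) of how releasing a tensor and
+    then calling this function can shrink the cached pool reported by
+    :func:`~torch.cuda.memory_reserved`:
+
+    .. code-block:: python
+
+        x = torch.empty(1024, 1024, device="cuda")
+        del x
+        before = torch.cuda.memory_reserved()
+        torch.cuda.empty_cache()
+        after = torch.cuda.memory_reserved()  # typically <= before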
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
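+
+    For example (device index and the ``abbreviated`` flag shown here are optional):
+
+    .. code-block:: python
+
+        print(torch.cuda.memory_summary(device=0, abbreviated=True))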
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
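+
+    A minimal usage sketch:
+
+    .. code-block:: python
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"{free_bytes / 1024**3:.2f} GiB free of {total_bytes / 1024**3:.2f} GiB")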
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
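+
+    A short sketch using the default/current allocator configuration:
+
+    .. code-block:: python
+
+        pool = torch.cuda.MemPool()
+        print(pool.id)           # (int, int) identifier inside the CUDACachingAllocator
+        print(pool.use_count())  # reference count of the pool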
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
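+
+    A minimal allocate/free round trip (the size is arbitrary; the pointer would normally
+    be handed to another framework before being released):
+
+    .. code-block:: python
+
+        ptr = torch.cuda.caching_allocator_alloc(1024)
+        # ... use the raw device pointer ...
+        torch.cuda.caching_allocator_delete(ptr)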
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
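The module added above is mostly introspection and allocator-configuration API, so a short usage sketch may help connect the pieces. The block below is an editorial illustration, not part of the patch: it assumes the module is available at its usual PyTorch location (``torch.cuda.memory``, which this file mirrors), a CUDA-capable build, and a recent PyTorch release that ships the MemPool APIs; the tensor shapes, ``max_entries`` value, and the ``snapshot.pickle`` filename are arbitrary placeholders.

# Illustrative sketch only (editor's example); exercises the statistics,
# history-recording/snapshot, and pool-routing APIs documented above.
import torch

if torch.cuda.is_available():
    # Start recording per-allocation stack traces so that the snapshot
    # dumped below includes alloc/free history (private API, may change).
    torch.cuda.memory._record_memory_history(max_entries=10_000)

    torch.cuda.reset_peak_memory_stats()
    x = torch.empty(1024, 1024, device="cuda")  # ~4 MiB of float32

    stats = torch.cuda.memory_stats()  # flat OrderedDict of counters
    print(stats["allocated_bytes.all.current"])          # bytes currently allocated
    print(torch.cuda.max_memory_allocated())             # peak since the reset above
    print(torch.cuda.memory_summary(abbreviated=True))   # human-readable table

    # Route allocations made inside the context into a private pool
    # (MemPool / use_mem_pool are the newer pool APIs defined in this module).
    pool = torch.cuda.memory.MemPool()
    with torch.cuda.memory.use_mem_pool(pool):
        y = torch.empty(256, 256, device="cuda")

    del x, y  # the frees also appear in the recorded history

    # Persist a snapshot viewable at pytorch.org/memory_viz, then stop recording.
    torch.cuda.memory._dump_snapshot("snapshot.pickle")
    torch.cuda.memory._record_memory_history(None)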
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
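+# ---------------------------------------------------------------------------
+# Hedged usage sketch (editorial addition, not part of the upstream module):
+# it illustrates how the MemPool / use_mem_pool API defined above can be
+# combined with the statistics helpers defined earlier in this module. The
+# tensor shape and the __main__ guard are illustrative assumptions; the demo
+# is skipped entirely when no CUDA device is present.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        demo_pool = MemPool()  # pool backed by the current/default allocator
+        with use_mem_pool(demo_pool):
+            scratch = torch.empty(1024, 1024, device="cuda")  # routed to demo_pool
+        print("allocated bytes:", memory_allocated())
+        print("pool use count:", demo_pool.use_count())
+        del scratch
+        empty_cache()  # return unoccupied cached blocks to the driver
+# ---------------------------------------------------------------------------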
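+# ---------------------------------------------------------------------------
+# Hedged sketch of the allocation-history workflow described in the
+# _record_memory_history / _snapshot / _dump_snapshot docstrings above
+# (editorial addition, not part of the upstream module). The file name and
+# the toy workload are illustrative assumptions; the resulting pickle can be
+# inspected with the viewer referenced at pytorch.org/memory_viz.
+if __name__ == "__main__":
+    if torch.cuda.is_available():
+        _record_memory_history("all", context="all", stacks="python")
+        workload = [torch.randn(256, 256, device="cuda") for _ in range(8)]
+        del workload  # generates free events in the recorded history
+        _dump_snapshot("demo_snapshot.pickle")  # pickled Snapshot dict
+        _record_memory_history(None)  # stop recording
+# ---------------------------------------------------------------------------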
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream, this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream.
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
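+
+    A minimal usage sketch (illustrative only; ``num_steps`` and ``train_step`` are
+    hypothetical placeholders, and a CUDA-capable build is assumed)::
+
+        >>> for step in range(num_steps):
+        ...     train_step()
+        ...     if step % 100 == 0:
+        ...         print(torch.cuda.memory_summary(abbreviated=True))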
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
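+
+    A minimal usage sketch (illustrative only; assumes a CUDA-capable build)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes / 2**30:.1f} GiB free of {total_bytes / 2**30:.1f} GiB total")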
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
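+
+    A minimal usage sketch (illustrative only; assumes a CUDA-capable build and that
+    this module is exposed as ``torch.cuda``, as the surrounding docstrings assume)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.memory.use_mem_pool(pool):
+        ...     x = torch.randn(1024, device="cuda")  # routed to ``pool``
+        >>> y = torch.randn(1024, device="cuda")      # routed to the default pool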
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
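+
+    A minimal interoperability sketch (illustrative only; assumes a CUDA-capable build)::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)  # 1 KiB on the current device/stream
+        >>> # ... hand the raw pointer to another framework ...
+        >>> torch.cuda.caching_allocator_delete(ptr)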
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
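+
+    A minimal per-iteration sketch (illustrative only; ``loader`` and ``train_step`` are
+    hypothetical placeholders, and a CUDA-capable build is assumed)::
+
+        >>> for batch in loader:
+        ...     torch.cuda.reset_peak_memory_stats()
+        ...     train_step(batch)
+        ...     print(torch.cuda.max_memory_reserved())  # peak reserved bytes for this step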
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
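+
+    A minimal usage sketch (illustrative only, not part of the upstream docstring;
+    assumes a CUDA device is available)::
+
+        # e.g. print periodically during training or when debugging an OOM
+        print(torch.cuda.memory_summary(device=0, abbreviated=True))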
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
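+
+    A minimal usage sketch (illustrative only, not part of the upstream docstring;
+    assumes a CUDA device is available)::
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"free: {free_bytes / 2**30:.2f} GiB / total: {total_bytes / 2**30:.2f} GiB")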
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
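+
+    A minimal usage sketch (illustrative only, not part of the upstream docstring;
+    assumes a CUDA device is available)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            buf = torch.empty(1024, device="cuda")  # this allocation routes to `pool`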
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
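As a quick illustration of the `_record_memory_history` / `_dump_snapshot` workflow that the patch defines above, here is a minimal usage sketch. It assumes a CUDA-enabled PyTorch build and that the module is importable as `torch.cuda.memory`; the workload and the output file name are placeholders.

    import torch

    # Start recording allocator events together with stack traces.
    torch.cuda.memory._record_memory_history(max_entries=100_000)

    # Placeholder workload that allocates and frees GPU memory.
    x = torch.randn(4096, 4096, device="cuda")
    y = x @ x
    del x, y

    # Write the snapshot for the pytorch.org/memory_viz viewer, then stop recording.
    torch.cuda.memory._dump_snapshot("snapshot.pickle")
    torch.cuda.memory._record_memory_history(enabled=None)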
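The `CUDAPluggableAllocator` path is easier to picture with a concrete call. The sketch below is only illustrative: the `.so` path and the exported symbol names `my_malloc` / `my_free` are placeholders for a library you would have to build yourself against the C signatures given in the docstring, and the swap must happen before the default allocator is first used.

    import torch

    # Load a custom allocator from a shared library exporting the
    # alloc/free functions described in CUDAPluggableAllocator above.
    custom = torch.cuda.memory.CUDAPluggableAllocator(
        "./my_allocator.so", "my_malloc", "my_free"
    )

    # change_current_allocator errors if the current allocator has
    # already been initialized (see its docstring above).
    torch.cuda.memory.change_current_allocator(custom)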
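Similarly, a minimal sketch of routing allocations through a `MemPool` via the `use_mem_pool` context manager defined above, assuming these names are re-exported under `torch.cuda` as the docstrings suggest:

    import torch

    pool = torch.cuda.MemPool()

    # Allocations made inside the context are routed to `pool`
    # on the current device.
    with torch.cuda.use_mem_pool(pool):
        buf = torch.empty(1 << 20, device="cuda")

    print(pool.id, pool.use_count())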
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
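+
+    A minimal sketch of per-iteration peak tracking (``train_step`` and
+    ``loader`` are placeholders for the caller's own code)::
+
+        for batch in loader:
+            torch.cuda.reset_peak_memory_stats()
+            train_step(batch)
+            peak_bytes = torch.cuda.max_memory_reserved()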
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
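+
+        # ROCm path: resolve the amdsmi device index, then look up the
+        # processor handle and its per-process memory list.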
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested',  # the allocated block received a call to free memory
+            'free_completed',  # the memory that was requested to be freed is now
+                               # able to be used in future allocation calls
+            'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                              # and adds it as a segment in its cache
+            'segment_free',  # the caching allocator called cudaFree to return memory
+                             # to cuda, possibly trying to free up memory to
+                             # allocate more segments or because empty_cache was called
+            'oom',  # the allocator threw an OOM exception. 'size' is
+                    # the requested number of bytes that did not succeed
+            'snapshot'  # the allocator generated a memory snapshot
+                        # useful to correlate a previously taken
+                        # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the ``.so`` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the ``.so`` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the ``.so`` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
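+
+    A minimal usage sketch (assuming a CUDA-capable build; the tensor size is
+    arbitrary)::
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # Allocations made inside this block are routed to ``pool``.
+            x = torch.empty(1024, device="cuda")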
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
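+
+
+# Usage sketch (illustrative only; nothing below is executed on import).
+# It shows how the helpers in this module are typically combined from user
+# code via ``torch.cuda``; it assumes a CUDA-capable build, and the file name
+# and tensor size are arbitrary.
+#
+#     import torch
+#
+#     torch.cuda.memory._record_memory_history(enabled="all")
+#     x = torch.empty(1024, 1024, device="cuda")    # traced allocation
+#     del x
+#     print(torch.cuda.memory_summary())            # human-readable table
+#     stats = torch.cuda.memory_stats()
+#     print(stats["allocated_bytes.all.peak"])      # peak bytes since start/reset
+#     torch.cuda.memory._dump_snapshot("snapshot.pickle")   # open at pytorch.org/memory_viz
+#     torch.cuda.memory._record_memory_history(enabled=None)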
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                        'active_awaiting_free', # waiting for another stream to finish using
+                                                # this, then it will become free
+                        'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested', # the allocation received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                              # able to be used in future allocation calls
+            'segment_alloc', # the caching allocator asked cudaMalloc for more memory
+                             # and added it as a segment in its cache
+            'segment_free', # the caching allocator called cudaFree to return memory
+                            # to CUDA, possibly trying to free up memory to
+                            # allocate more segments or because empty_cache() was called
+            'oom',          # the allocator threw an OOM exception. 'size' is
+                            # the requested number of bytes that did not succeed
+            'snapshot' # the allocator generated a memory snapshot
+                       # useful to correlate a previously taken
+                       # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
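+
+
+# Editor's note: hedged usage sketch, not part of the upstream module. It ties
+# together the helpers defined above (_record_memory_history, _snapshot/_dump_snapshot,
+# MemPool, use_mem_pool) so the intended workflow is easier to follow. It assumes a
+# CUDA-capable build and device; the name `_demo_memory_debugging` and the dump
+# filename are illustrative only.
+def _demo_memory_debugging(filename: str = "dump_snapshot.pickle") -> None:
+    if not torch.cuda.is_available():
+        return
+    # Record alloc/free history with tracebacks so the snapshot can attribute memory.
+    _record_memory_history(enabled="all", context="all", stacks="all")
+    try:
+        # Route some allocations through a private pool via the context manager above.
+        pool = MemPool()
+        with use_mem_pool(pool):
+            x = torch.empty(1024, 1024, device="cuda")
+        del x
+        # Pickle the snapshot; it can be inspected with the viewer at pytorch.org/memory_viz.
+        _dump_snapshot(filename)
+    finally:
+        _record_memory_history(enabled=None)  # stop recording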
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                           'active_awaiting_free', # waiting for another stream to finish using
+                                                   # this, then it will become free
+                           'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc'  # memory allocated
+                'free_requested', # the allocator received a call to free memory
+                'free_completed', # the memory that was requested to be freed is now
+                                  # able to be used in future allocation calls
+                'segment_alloc', # the caching allocator asks cudaMalloc for more memory
+                                 # and added it as a segment in its cache
+                'segment_free', # the caching allocator called cudaFree to return memory
+                                # to cuda possibly trying to free up memory to
+                                # allocate more segments or because empty_cache was called
+                'oom', # the allocator threw an OOM exception. 'size' is
+                       # the requested number of bytes that did not succeed
+                'snapshot' # the allocator generated a memory snapshot
+                           # useful to correlate a previously taken
+                           # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSes.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
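For reference, the pool APIs added above can be exercised as follows. This is a minimal usage sketch, assuming the module is importable as ``torch.cuda.memory`` (as in upstream PyTorch) and a CUDA device is available:

    import torch
    from torch.cuda.memory import MemPool, use_mem_pool

    if torch.cuda.is_available():
        # Pool backed by the current CUDACachingAllocator configuration.
        pool = MemPool()
        with use_mem_pool(pool):
            # Allocations made inside the context are routed to `pool`.
            x = torch.empty(1024, 1024, device="cuda")
        print(pool.id)            # ID of this pool as a tuple of two ints
        print(pool.use_count())   # reference count of the pool
        segments = pool.snapshot()  # allocator snapshot taken with this pool active
        print(len(segments))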
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
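+
+
+# Illustrative sketch (not part of the original module): the intended workflow
+# described in the docstring above is enable -> run workload -> snapshot/dump.
+# The helper name `_example_capture_history` and the `workload` callable are
+# hypothetical; `max_entries` merely bounds the recorded event history.
+def _example_capture_history(workload, out_file="dump_snapshot.pickle"):
+    """Record alloc/free history around `workload()` and dump it for memory_viz."""
+    _record_memory_history(enabled="all", context="all", max_entries=100000)
+    try:
+        workload()
+    finally:
+        # _dump_snapshot (defined below) pickles _snapshot() for the viewer at
+        # pytorch.org/memory_viz.
+        _dump_snapshot(out_file)
+        _record_memory_history(enabled=None)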
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocated block received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
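+
+
+# Illustrative sketch (not part of the original module): the Snapshot structure
+# documented above can be reduced to per-stream totals by walking segments and
+# their blocks. The helper name `_example_active_bytes_per_stream` is hypothetical.
+def _example_active_bytes_per_stream():
+    """Sum the sizes of 'active_allocated' blocks, grouped by each segment's stream."""
+    totals = {}
+    for segment in _snapshot()["segments"]:
+        for block in segment["blocks"]:
+            if block["state"] == "active_allocated":
+                totals[segment["stream"]] = totals.get(segment["stream"], 0) + block["size"]
+    return totals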
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
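+
+
+# Illustrative sketch (not part of the original module): wiring a pluggable
+# allocator together with change_current_allocator (defined just below), as the
+# class docstring above describes. The path "./my_alloc.so" and the exported
+# symbol names are hypothetical placeholders.
+def _example_install_pluggable_allocator():
+    """Load alloc/free symbols from a shared library and make them the active allocator."""
+    custom = CUDAPluggableAllocator("./my_alloc.so", "my_malloc", "my_free")
+    # Must run before the default caching allocator has been initialized/used.
+    change_current_allocator(custom)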
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
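+
+
+# Illustrative sketch (not part of the original module): routing a region of
+# allocations into a dedicated pool using MemPool and use_mem_pool, then
+# inspecting that pool. The helper name `_example_pooled_allocation` and the
+# tensor shape are arbitrary; this assumes an initialized CUDA device.
+def _example_pooled_allocation():
+    """Allocate a tensor inside a private MemPool and return the pool's snapshot."""
+    pool = MemPool()  # uses the allocator's current configuration
+    with use_mem_pool(pool):
+        scratch = torch.empty(1024, device="cuda")  # served from `pool`
+    del scratch
+    return pool.snapshot()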
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
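+
+
+# Editor's note: the sketch below is an illustration added for clarity and is not
+# part of the upstream API. It shows the MemPool / use_mem_pool workflow described
+# above: allocations made inside the context are served from the given pool. The
+# helper name and tensor size are arbitrary, and a CUDA-capable build is assumed.
+def _mem_pool_usage_sketch() -> None:
+    pool = MemPool()  # default allocator configuration of the CUDACachingAllocator
+    with use_mem_pool(pool):
+        scratch = torch.empty(1024, 1024, device="cuda")  # allocated from `pool`
+    # Outside the context, new allocations go to the global pool again, but `pool`
+    # stays referenced while `scratch` still holds memory inside it.
+    print("pool id:", pool.id, "use_count:", pool.use_count())
+    del scratch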
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the caching allocator's memory on a CUDA device.
+    The allowed amount equals the total visible memory multiplied by the fraction.
+    Trying to allocate more than the allowed amount in a process raises an
+    out-of-memory error in the allocator.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and becomes
+    visible in `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
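+
+    A minimal illustration (editor's sketch; the tensor size is arbitrary and a
+    CUDA device is assumed)::
+
+        x = torch.empty(1024, 1024, device="cuda")
+        del x                     # the block returns to the allocator's cache
+        torch.cuda.empty_cache()  # the cached block is released back to the driver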
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+            `"all"`, additionally include C++ frames.
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot,
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory CUDA still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in `.so` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like OSs.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator.
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
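+
+
+# ---------------------------------------------------------------------------
+# Editor's note: the sketch below is illustrative only and is not part of the
+# allocator API. It assumes a CUDA-capable GPU is available and that this
+# module's public helpers are the ones defined above; the tensor shapes and
+# the snapshot file name are arbitrary. It shows how the statistics,
+# snapshot-recording, and MemPool helpers documented above fit together.
+# ---------------------------------------------------------------------------
+if __name__ == "__main__" and torch.cuda.is_available():
+    # 1. Allocator statistics: allocate a tensor, then inspect the allocator.
+    x = torch.empty(1024, 1024, device="cuda")
+    print("allocated bytes:", memory_allocated())  # memory occupied by tensors
+    print("reserved bytes: ", memory_reserved())  # memory held by the caching allocator
+    print(memory_summary(abbreviated=True))  # human-readable table of memory_stats()
+
+    # 2. Snapshot workflow: record alloc/free history, then dump it for the
+    #    interactive viewer at pytorch.org/memory_viz.
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    y = torch.empty(256, 256, device="cuda")
+    del y
+    _dump_snapshot("example_snapshot.pickle")  # hypothetical output file name
+    _record_memory_history(enabled=None)  # stop recording history
+
+    # 3. Pool routing: allocations made inside the context go to `pool`.
+    pool = MemPool()
+    with use_mem_pool(pool):
+        z = torch.empty(512, 512, device="cuda")
+    print("pool id:", pool.id, "use count:", pool.use_count())
+
+    # Release cached blocks back to the driver once the tensors are gone.
+    del x, z
+    empty_cache()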
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in ``.so`` files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
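+
+
+# Illustrative sketch (not part of the original file): swapping in a pluggable
+# allocator. The shared library path and exported symbol names are placeholders
+# for an allocator built separately.
+#
+#     new_alloc = CUDAPluggableAllocator("my_alloc.so", "my_malloc", "my_free")
+#     change_current_allocator(new_alloc)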
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
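+
+
+# Illustrative sketch (not part of the original file): routing allocations through
+# a dedicated pool with the context manager above. Tensor sizes are arbitrary.
+#
+#     pool = MemPool()
+#     with use_mem_pool(pool):
+#         x = torch.empty(1024, device="cuda")  # allocated from `pool`
+#     y = torch.empty(1024, device="cuda")      # back to the default allocator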
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from .
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
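+
+
+# Illustrative usage sketches. These are minimal, hedged examples of the
+# helpers defined above; the ``_example_*`` names are hypothetical and are
+# not part of the module's public API.
+
+
+def _example_read_memory_stats(device: Union[Device, int] = None) -> None:
+    # Minimal sketch: a flattened key such as "allocated_bytes.all.current"
+    # returned by ``memory_stats`` corresponds to
+    # ``nested["allocated_bytes"]["all"]["current"]`` in the nested form, and
+    # the host-side counters follow the same naming scheme.
+    if not torch.cuda.is_available():
+        return
+    flat = memory_stats(device=device)
+    nested = memory_stats_as_nested_dict(device=device)
+    print(flat.get("allocated_bytes.all.current", 0))
+    print(nested.get("allocated_bytes", {}).get("all", {}).get("current", 0))
+    print(host_memory_stats().get("allocated.current", 0))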
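+
+
+def _example_peak_memory_per_step(model, batch) -> int:
+    # Minimal sketch of the pattern described in ``max_memory_allocated``:
+    # reset the peak counters, run one step, then read back the peak in bytes.
+    # ``model`` and ``batch`` are hypothetical placeholders for a real workload.
+    if not torch.cuda.is_available():
+        return 0
+    reset_peak_memory_stats()
+    model(batch)
+    return max_memory_allocated()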
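+
+
+def _example_report_memory(device: Union[Device, int] = None) -> str:
+    # Minimal sketch combining the human-readable diagnostics above: raw
+    # free/total numbers from ``mem_get_info``, the tabular allocator summary,
+    # and the per-process listing (assumes a CUDA device is present).
+    if not torch.cuda.is_available():
+        return "CUDA is not available"
+    free, total = mem_get_info(device)
+    header = f"free={free} bytes, total={total} bytes"
+    summary = memory_summary(device=device, abbreviated=True)
+    return "\n".join([header, summary, list_gpu_processes(device)])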
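+
+
+def _example_capture_allocator_history(filename: str = "snapshot.pickle") -> None:
+    # Minimal sketch of the workflow described in ``_record_memory_history``:
+    # enable trace collection, exercise the allocator, dump a snapshot for the
+    # viewer mentioned in ``_dump_snapshot``, then stop recording. The
+    # temporary tensor only exists to generate some alloc/free events.
+    if not torch.cuda.is_available():
+        return
+    _record_memory_history(max_entries=100_000)
+    torch.empty(1024, device="cuda")
+    _dump_snapshot(filename)
+    _record_memory_history(enabled=None)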
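+
+
+def _example_route_allocations_to_pool() -> None:
+    # Minimal sketch of ``MemPool`` with ``use_mem_pool``: allocations made
+    # inside the context are routed to the pool, and ``pool.snapshot()`` then
+    # reports the segments backing it (assumes CUDA is available).
+    if not torch.cuda.is_available():
+        return
+    pool = MemPool()
+    with use_mem_pool(pool):
+        t = torch.empty(1024, device="cuda")
+    print(len(pool.snapshot()))
+    del t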
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions.
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool."""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
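+
+    Example (a minimal sketch; assumes a CUDA device and a PyTorch build recent
+    enough to expose ``torch.cuda.MemPool``)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     buf = torch.empty(1024, device="cuda")  # routed to ``pool``
+        >>> pool_state = pool.snapshot()  # snapshot of allocator pool state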
+
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
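+
+
+if __name__ == "__main__":
+    # Illustrative sketch only; assumes a CUDA-capable device. Prints a peak
+    # allocation figure and the abbreviated allocator summary for device 0.
+    if torch.cuda.is_available():
+        torch.cuda.reset_peak_memory_stats()
+        _demo = torch.empty(64 * 1024 * 1024, device="cuda")  # ~256 MiB of float32
+        print(f"peak allocated: {torch.cuda.max_memory_allocated() / 1024**2:.1f} MiB")
+        print(torch.cuda.memory_summary(abbreviated=True))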
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . import (
+    _get_amdsmi_device_index,
+    _get_device_index,
+    _get_nvml_device_index,
+    _lazy_init,
+    is_initialized,
+)
+from ._memory_viz import memory as _memory, segments as _segments
+
+
+__all__ = [
+    "caching_allocator_alloc",
+    "caching_allocator_delete",
+    "caching_allocator_enable",
+    "get_per_process_memory_fraction",
+    "set_per_process_memory_fraction",
+    "empty_cache",
+    "memory_stats",
+    "memory_stats_as_nested_dict",
+    "reset_accumulated_memory_stats",
+    "reset_peak_memory_stats",
+    "reset_max_memory_allocated",
+    "reset_max_memory_cached",
+    "host_memory_stats",
+    "host_memory_stats_as_nested_dict",
+    "reset_accumulated_host_memory_stats",
+    "reset_peak_host_memory_stats",
+    "memory_allocated",
+    "max_memory_allocated",
+    "memory_reserved",
+    "max_memory_reserved",
+    "memory_cached",
+    "max_memory_cached",
+    "memory_snapshot",
+    "memory_summary",
+    "list_gpu_processes",
+    "mem_get_info",
+    "get_allocator_backend",
+    "CUDAPluggableAllocator",
+    "change_current_allocator",
+    "MemPool",
+    "MemPoolContext",
+    "use_mem_pool",
+]
+
+
+if not hasattr(torch._C, "_cuda_CUDAAllocator"):
+    # Define dummy base classes
+    torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator")
+
+
+if not hasattr(torch._C, "_MemPool"):
+    # Define dummy base classes
+    torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool")
+    torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext")
+    torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type(
+        "_cuda_beginAllocateToPool"
+    )
+    torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type(
+        "_cuda_endAllocateCurrentStreamToPool"
+    )
+    torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool")
+
+from torch._C import (  # noqa: F401
+    _cuda_beginAllocateToPool,
+    _cuda_CUDAAllocator,
+    _cuda_endAllocateCurrentStreamToPool,
+    _cuda_releasePool,
+    _MemPool,
+    _MemPoolContext,
+)
+
+
+def _host_allocator():
+    _lazy_init()
+    return torch._C._cuda_cudaHostAllocator()
+
+
+@contextlib.contextmanager
+def _free_mutex():
+    torch._C._cuda_lock_mutex()
+    try:
+        yield
+    finally:
+        torch._C._cuda_unlock_mutex()
+
+
+def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None):
+    r"""Perform a memory allocation using the CUDA memory allocator.
+
+    Memory is allocated for a given device and a stream, this
+    function is intended to be used for interoperability with other
+    frameworks. Allocated memory is released through
+    :func:`~torch.cuda.caching_allocator_delete`.
+
+    Args:
+        size (int): number of bytes to be allocated.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+        stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
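+
+    Example (a minimal sketch; assumes a CUDA device, and the numbers observed
+    depend on prior allocator activity)::
+
+        >>> x = torch.empty(1024, 1024, device="cuda")
+        >>> del x                     # the freed block stays cached by the allocator
+        >>> torch.cuda.empty_cache()  # return unused cached memory to the driver
+        >>> reserved = torch.cuda.memory_reserved()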
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
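+
+    Example (illustrative; assumes an initialized CUDA context)::
+
+        >>> print(torch.cuda.memory_summary(abbreviated=True))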
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
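+
+    Example (illustrative; the reported values depend on the machine and on
+    other processes using the GPU)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> free_gib = free_bytes / (1024 ** 3)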
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
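+
+    Example (a minimal sketch; uses :func:`~torch.cuda.use_mem_pool`, also
+    defined in this module, to route allocations into the pool)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # allocated from ``pool``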
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
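+
+    Example (a minimal sketch; the returned raw pointer must be released with
+    :func:`~torch.cuda.caching_allocator_delete`)::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)  # 1 KiB on the current device/stream
+        >>> torch.cuda.caching_allocator_delete(ptr)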
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
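+
+    Example (illustrative; measures the peak reserved memory of one iteration)::
+
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> y = torch.randn(4096, 4096, device="cuda") @ torch.randn(4096, 4096, device="cuda")
+        >>> peak_bytes = torch.cuda.max_memory_reserved()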
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+        device = _get_amdsmi_device_index(device)
+
+        try:
+            handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+            procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+        except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+            return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and adds it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix OSs.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
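+
+
+# Illustrative usage sketch, not part of the original torch.cuda.memory API:
+# how the peak-memory counters and memory_summary() defined above can be
+# combined to report the memory footprint of one unit of work. `step_fn` is a
+# placeholder for arbitrary user code.
+def _example_peak_memory_report(step_fn, device: Union[Device, int] = None) -> str:
+    reset_peak_memory_stats(device)
+    step_fn()
+    # Wait for queued kernels so the counters reflect the finished work.
+    torch.cuda.synchronize(device)
+    peak = max_memory_allocated(device)
+    return f"peak allocated: {peak} bytes\n" + memory_summary(device)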
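+
+
+# Illustrative usage sketch, not part of the original torch.cuda.memory API:
+# recording an allocation history around a workload with the private
+# _record_memory_history()/_dump_snapshot() helpers above, so the resulting
+# pickle can be opened at pytorch.org/memory_viz. `workload_fn` is a
+# placeholder for user code.
+def _example_dump_allocation_history(workload_fn, filename="snapshot.pickle") -> None:
+    _record_memory_history(enabled="all", context="all", stacks="all")
+    try:
+        workload_fn()
+    finally:
+        # Write the snapshot even if the workload raised (e.g. on an OOM).
+        _dump_snapshot(filename)
+        # Stop recording once the history has been saved.
+        _record_memory_history(enabled=None)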
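+
+
+# Illustrative usage sketch, not part of the original torch.cuda.memory API:
+# routing the allocations of a temporary buffer through a dedicated MemPool
+# with the use_mem_pool() context manager defined above.
+def _example_allocate_in_pool(num_elements: int) -> torch.Tensor:
+    pool = MemPool()
+    with use_mem_pool(pool):
+        # Every CUDA allocation made inside this block is served from `pool`.
+        buffer = torch.empty(num_elements, device="cuda")
+    return buffer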
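+
+
+# Illustrative usage sketch, not part of the original torch.cuda.memory API:
+# a small diagnostic that combines mem_get_info() and list_gpu_processes(),
+# e.g. for logging when an out-of-memory error is caught.
+def _example_device_memory_report(device: Union[Device, int] = None) -> str:
+    free_bytes, total_bytes = mem_get_info(device)
+    header = f"free: {free_bytes / 2**20:.1f} MiB / total: {total_bytes / 2**20:.1f} MiB"
+    return header + "\n" + list_gpu_processes(device)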
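+
+
+# Illustrative usage sketch, not part of the original torch.cuda.memory API:
+# installing a custom allocator with CUDAPluggableAllocator and
+# change_current_allocator(). The library path and symbol names are
+# placeholders, and this must run before the default allocator has served
+# any CUDA allocation.
+def _example_install_custom_allocator(so_path="./my_allocator.so") -> None:
+    custom = CUDAPluggableAllocator(so_path, "my_malloc", "my_free")
+    change_current_allocator(custom)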
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+
+    See :func:`~torch.cuda.max_memory_cached` for details.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. warning::
+        This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets
+        /all/ peak memory stats.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    warnings.warn(
+        "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, "
+        "which resets /all/ peak memory stats.",
+        FutureWarning,
+    )
+    return reset_peak_memory_stats(device=device)
+
+
+def memory_allocated(device: Union[Device, int] = None) -> int:
+    r"""Return the current GPU memory occupied by tensors in bytes for a given device.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        This is likely less than the amount shown in `nvidia-smi` since some
+        unused memory can be held by the caching allocator and some context
+        needs to be created on GPU. See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    return memory_stats(device=device).get("allocated_bytes.all.current", 0)
+
+
+def max_memory_allocated(device: Union[Device, int] = None) -> int:
+    r"""Return the maximum GPU memory occupied by tensors in bytes for a given device.
+
+    By default, this returns the peak allocated memory since the beginning of
+    this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to
+    reset the starting point in tracking this metric. For example, these two
+    functions can measure the peak allocated memory usage of each iteration in a
+    training loop.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    return memory_stats(device=device).get("allocated_bytes.all.peak", 0)
+
+
+def memory_reserved(device: Union[Device, int] = None) -> int:
+    r"""Return the current GPU memory managed by the caching allocator in bytes for a given device.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    return memory_stats(device=device).get("reserved_bytes.all.current", 0)
+
+
+def max_memory_reserved(device: Union[Device, int] = None) -> int:
+    r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device.
+
+    By default, this returns the peak cached memory since the beginning of this
+    program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset
+    the starting point in tracking this metric. For example, these two functions
+    can measure the peak cached memory amount of each iteration in a training
+    loop.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    return memory_stats(device=device).get("reserved_bytes.all.peak", 0)
+
+
+@deprecated(
+    "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`",
+    category=FutureWarning,
+)
+def memory_cached(device: Union[Device, int] = None) -> int:
+    r"""Deprecated; see :func:`~torch.cuda.memory_reserved`."""
+    return memory_reserved(device=device)
+
+
+@deprecated(
+    "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`",
+    category=FutureWarning,
+)
+def max_memory_cached(device: Union[Device, int] = None) -> int:
+    r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`."""
+    return max_memory_reserved(device=device)
+
+
+def memory_snapshot():
+    r"""Return a snapshot of the CUDA memory allocator state across all devices.
+
+    Interpreting the output of this function requires familiarity with the
+    memory allocator internals.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    return torch._C._cuda_memorySnapshot()["segments"]
+
+
+def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str:
+    r"""Return a human-readable printout of the current memory allocator statistics for a given device.
+
+    This can be useful to display periodically during training, or when
+    handling out-of-memory exceptions.
+
+    Args:
+        device (torch.device or int, optional): selected device. Returns
+            printout for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+        abbreviated (bool, optional): whether to return an abbreviated summary
+            (default: False).
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the .so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the .so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
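For reference, here is a minimal usage sketch of the ``MemPool`` / ``use_mem_pool`` API added above. It is an illustration only, not part of the patch, and it assumes the module is importable at its upstream location, ``torch.cuda.memory``, on a machine with a CUDA device:

.. code-block:: python

    import torch
    from torch.cuda.memory import MemPool, use_mem_pool

    if torch.cuda.is_available():
        # A pool backed by the default CUDACachingAllocator configuration.
        pool = MemPool()

        # Allocations made inside the context on the current device's
        # current stream are routed to `pool`.
        with use_mem_pool(pool):
            x = torch.empty(1024, 1024, device="cuda")

        # Allocations made outside the context use the default pool again.
        y = torch.empty(1024, 1024, device="cuda")

        print(pool.id, pool.use_count())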
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
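+
+    A minimal usage sketch (it assumes a CUDA device is available and builds
+    the pool with the default allocator; tensor sizes and variable names are
+    illustrative only):
+
+    .. code-block:: python
+
+        import torch
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # allocations made inside the context are routed to the pool
+            x = torch.randn(1024, device="cuda")
+        # allocations made outside the context use the default pool again
+        y = torch.randn(1024, device="cuda")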
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
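+
+    A minimal sketch of the typical pattern (the tensor and its size are
+    illustrative; a CUDA device is assumed to be available):
+
+    .. code-block:: python
+
+        import torch
+
+        x = torch.empty(1024, 1024, device="cuda")
+        del x                     # the block returns to the allocator's cache
+        torch.cuda.empty_cache()  # cached, unused blocks are released to the driver
+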
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
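+
+    A short sketch of one common use, printing the summary from an
+    out-of-memory handler (the oversized allocation and the choice of
+    exception type are illustrative assumptions):
+
+    .. code-block:: python
+
+        import torch
+
+        try:
+            x = torch.empty(1 << 40, device="cuda")  # deliberately too large
+        except torch.cuda.OutOfMemoryError:
+            print(torch.cuda.memory_summary(abbreviated=True))
+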
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
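+
+    A small usage sketch (the GiB formatting below is illustrative only):
+
+    .. code-block:: python
+
+        import torch
+
+        free, total = torch.cuda.mem_get_info()
+        print(f"{free / 2**30:.2f} GiB free of {total / 2**30:.2f} GiB "
+              f"({100 * free / total:.1f}% free)")
+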
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
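+
+    A brief sketch of constructing and inspecting a pool (built on the
+    default allocator; printing the fields is illustrative only):
+
+    .. code-block:: python
+
+        import torch
+
+        pool = torch.cuda.MemPool()   # backed by the current allocator
+        print(pool.id)                # tuple of two ints identifying the pool
+        print(pool.use_count())       # reference count held on this pool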
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
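+
+    A hedged sketch of the raw alloc/free round trip (the 1 KiB size and the
+    use of the current device and stream are illustrative choices):
+
+    .. code-block:: python
+
+        import torch
+
+        ptr = torch.cuda.caching_allocator_alloc(1024)
+        # ... hand ptr to another framework as a raw device pointer ...
+        torch.cuda.caching_allocator_delete(ptr)
+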
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
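+
+    Example (an illustrative sketch added in editing; assumes a CUDA device is
+    available)::
+
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> # ... run one training iteration ...
+        >>> peak_bytes = torch.cuda.max_memory_reserved()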
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
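+
+    Example (an illustrative sketch added in editing; assumes a CUDA device and
+    the usual ``torch.cuda`` re-export of this module)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # allocation routed to ``pool``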
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+    del ctx
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot'  # the allocator generated a memory snapshot
+                            # useful to correlate a previously taken
+                            # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
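+
+
+# Illustrative sketch, not part of the public API: the Segment layout
+# documented in :func:`_snapshot` can be reduced to a few totals. The helper
+# name `_example_summarize_snapshot_segments` is a hypothetical placeholder.
+def _example_summarize_snapshot_segments(snapshot=None):
+    """Return ``(reserved, allocated, active)`` byte totals computed from a
+    snapshot dictionary, defaulting to a freshly taken :func:`_snapshot`."""
+    if snapshot is None:
+        snapshot = _snapshot()
+    segments = snapshot["segments"]
+    # "total_size" is the cudaMalloc'd size of each segment, so summing it
+    # reproduces the reserved-memory figure; the other two fields follow the
+    # Segment structure documented above.
+    reserved = sum(seg["total_size"] for seg in segments)
+    allocated = sum(seg["allocated_size"] for seg in segments)
+    active = sum(seg["active_size"] for seg in segments)
+    return reserved, allocated, active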
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a .so file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled into .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
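+
+
+# Illustrative sketch, not part of the public API: how a pluggable allocator
+# built from a shared library might be swapped in. The path
+# "my_allocator.so" and the exported symbols "my_alloc"/"my_free" are
+# hypothetical placeholders; the library must export functions with the
+# signatures documented in :class:`CUDAPluggableAllocator`.
+def _example_use_pluggable_allocator(
+    so_path: str = "my_allocator.so",
+    alloc_fn: str = "my_alloc",
+    free_fn: str = "my_free",
+) -> _CUDAAllocator:
+    """Load a custom allocator and make it the active CUDA allocator.
+
+    This has to run before the current allocator has been used, otherwise
+    :func:`change_current_allocator` raises an error.
+    """
+    custom = CUDAPluggableAllocator(so_path, alloc_fn, free_fn)
+    change_current_allocator(custom)
+    return _get_current_allocator()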
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
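+
+
+# Illustrative sketch, not part of the public API: routing a few allocations
+# into a dedicated MemPool via the `use_mem_pool` context manager defined
+# above. The helper name `_example_allocate_in_pool` is a hypothetical
+# placeholder.
+def _example_allocate_in_pool(num_tensors: int = 4):
+    """Allocate some CUDA tensors inside a fresh MemPool and return the
+    tensors, the pool, and the pool's segment snapshot."""
+    pool = MemPool()
+    with use_mem_pool(pool):
+        tensors = [torch.empty(1024, device="cuda") for _ in range(num_tensors)]
+    # The pool keeps its segments cached for reuse; snapshot() shows what it
+    # currently holds.
+    return tensors, pool, pool.snapshot()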
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
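+
+    Example (a minimal sketch; printing inside a training loop or an OOM
+    handler is one possible use, and ``abbreviated=True`` keeps the table short)::
+
+        >>> print(torch.cuda.memory_summary(abbreviated=True))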
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
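+
+    Example (a minimal sketch; assumes at least one CUDA device is present)::
+
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> print(f"{free_bytes / total_bytes:.1%} of device memory is free")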
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free its memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly to free up memory for more segments
+                                 # or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot,
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. 

note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
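+
+    Example (a minimal sketch using the default allocator; the tensor size is
+    arbitrary and :func:`~torch.cuda.use_mem_pool` routes the allocation)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # served from ``pool``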
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
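+
+    Example (a minimal sketch; the size is arbitrary and the raw pointer is
+    only handed back to :func:`~torch.cuda.caching_allocator_delete`)::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)
+        >>> # ... pass ``ptr`` to another framework ...
+        >>> torch.cuda.caching_allocator_delete(ptr)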
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
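+
+    Example (a minimal sketch of per-iteration peak tracking; the workload
+    function is hypothetical)::
+
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> run_one_iteration()                    # hypothetical workload
+        >>> peak = torch.cuda.max_memory_reserved()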
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
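+
+
+# Illustrative sketch (not part of the upstream module): checks headroom with
+# mem_get_info() and turns on allocation-history recording via the private
+# _record_memory_history() helper above. Assumes a CUDA device; the helper name
+# `_demo_enable_history` is hypothetical.
+def _demo_enable_history(max_events: int = 100000) -> None:
+    free_bytes, total_bytes = mem_get_info()
+    print(f"free GPU memory: {free_bytes / total_bytes:.1%} of {total_bytes} bytes")
+    # Record Python stacks for every alloc/free, keeping at most `max_events` events.
+    _record_memory_history(
+        enabled="all", context="all", stacks="python", max_entries=max_events
+    )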
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive']  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocation received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to cuda, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
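+
+
+# Illustrative sketch (not part of the upstream module): summarizes the structure
+# documented in _snapshot() above. Assumes recording was enabled beforehand (see
+# _record_memory_history); `_demo_snapshot_overview` is a hypothetical helper name.
+def _demo_snapshot_overview() -> None:
+    snap = _snapshot()
+    reserved = sum(seg["total_size"] for seg in snap["segments"])
+    print(f"{len(snap['segments'])} segments, {reserved} bytes reserved")
+    for dev, trace in enumerate(snap["device_traces"]):
+        print(f"device {dev}: {len(trace)} recorded allocator events")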
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the so file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the so file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
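+
+
+# Illustrative sketch (not part of the upstream module): wires a pluggable allocator
+# into change_current_allocator(). The shared-object path and exported symbol names
+# below are hypothetical placeholders; as the docstring above notes, this only works
+# before the default allocator has been used/initialized.
+def _demo_swap_allocator() -> None:
+    custom = CUDAPluggableAllocator(
+        "./my_alloc.so",  # hypothetical library implementing the two C symbols below
+        "my_malloc",  # void* my_malloc(ssize_t size, int device, cudaStream_t stream)
+        "my_free",  # void my_free(void* ptr, size_t size, cudaStream_t stream)
+    )
+    change_current_allocator(custom)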
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
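+
+
+# Illustrative sketch (not part of the upstream module): creates a MemPool backed by
+# the default allocator and inspects it through the wrappers above. Assumes a CUDA
+# device; `_demo_mempool_info` is a hypothetical helper name.
+def _demo_mempool_info() -> None:
+    pool = MemPool()  # allocator=None -> use the CUDACachingAllocator's current config
+    print("pool id:", pool.id, "use count:", pool.use_count())
+    ctx = MemPoolContext(pool)  # make it the active pool; previous pool restored on deletion
+    assert MemPoolContext.active_pool() is not None
+    del ctx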
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
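+
+
+# Illustrative sketch (not part of the upstream module): routes a temporary workspace
+# allocation into a dedicated pool with the use_mem_pool() context manager defined
+# above, then inspects the pool. Assumes a CUDA device; `_demo_use_mem_pool` is a
+# hypothetical helper name.
+def _demo_use_mem_pool() -> None:
+    pool = MemPool()
+    with use_mem_pool(pool):
+        workspace = torch.randn(256, 256, device="cuda")  # allocated from `pool`
+        print("segments in snapshot:", len(pool.snapshot()))
+    del workspace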
+# mypy: allow-untyped-defs
+r"""This package adds support for device memory management implemented in CUDA."""
+
+import collections
+import contextlib
+import ctypes
+import pickle
+import sys
+import warnings
+from inspect import signature
+from typing import Any, Literal, Optional, Union
+from typing_extensions import deprecated
+
+import torch
+from torch import _C
+from torch._utils import _dummy_type
+from torch.types import Device
+
+from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
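+
+    Example showing how a pool is created and activated for a block of
+    allocations (an illustrative sketch, not a normative recipe; it assumes a
+    CUDA device is available and that ``MemPool`` and ``use_mem_pool`` are
+    exposed under ``torch.cuda``)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     # allocations made inside the block are routed to ``pool``
+        ...     x = torch.arange(1024, device="cuda")
+        >>> # allocations made outside the block use the default pool again
+        >>> y = torch.arange(1024, device="cuda")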
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If it is ``None`` then
+        the default stream for the selected device is used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if stream is None:
+        stream = torch.cuda.current_stream(device)
+    if isinstance(stream, torch.cuda.streams.Stream):
+        stream = stream.cuda_stream
+    if not isinstance(stream, int):
+        raise TypeError(
+            "Invalid type for stream argument, must be "
+            "`torch.cuda.Stream` or `int` representing a pointer "
+            "to an existing stream"
+        )
+    with torch.cuda.device(device):
+        return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream)
+
+
+def caching_allocator_delete(mem_ptr):
+    r"""Delete memory allocated using the CUDA memory allocator.
+
+    Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`
+    is freed here. The associated device and stream are tracked inside
+    the allocator.
+
+    Args:
+        mem_ptr (int): memory address to be freed by the allocator.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more details about GPU memory
+        management.
+    """
+    torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)
+
+
+def caching_allocator_enable(value: bool = True) -> None:
+    r"""Enable or disable the CUDA memory allocator. On by default."""
+    if is_initialized():
+        torch._C._cuda_cudaCachingAllocator_enable(value)
+
+
+def set_per_process_memory_fraction(
+    fraction, device: Union[Device, int] = None
+) -> None:
+    r"""Set memory fraction for a process.
+
+    The fraction is used to limit the memory that the caching allocator can allocate
+    on a CUDA device. The allowed value equals the total visible memory multiplied by
+    the fraction. If a process tries to allocate more than the allowed value, the
+    allocator raises an out-of-memory error.
+
+    Args:
+        fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction.
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    .. note::
+        In general, the total available free memory is less than the total capacity.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    if not isinstance(fraction, float):
+        raise TypeError("Invalid type for fraction argument, must be `float`")
+    if fraction < 0 or fraction > 1:
+        raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1")
+
+    torch._C._cuda_setMemoryFraction(fraction, device)
+
+
+def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float:
+    r"""Get memory fraction for a process.
+
+    Args:
+        device (torch.device or int, optional): selected device. If it is
+            ``None`` the default CUDA device is used.
+    Returns:
+        memory fraction, in range 0~1. Allowed memory equals total_memory * fraction.
+    """
+    _lazy_init()
+    if device is None:
+        device = torch.cuda.current_device()
+    device = _get_device_index(device)
+    return torch._C._cuda_getMemoryFraction(device)
+
+
+def empty_cache() -> None:
+    r"""Release all unoccupied cached memory currently held by the caching
+    allocator so that it can be used by other GPU applications and is visible in
+    `nvidia-smi`.
+
+    .. note::
+        :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU
+        memory available for PyTorch. However, it may help reduce fragmentation
+        of GPU memory in certain cases. See :ref:`cuda-memory-management` for
+        more details about GPU memory management.
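+
+    Example showing the typical allocate / free / flush sequence (an
+    illustrative sketch; it assumes a CUDA device with enough free memory for
+    a tensor of roughly 1 GiB)::
+
+        >>> x = torch.empty(256, 1024, 1024, device="cuda")  # ~1 GiB with the default float32 dtype
+        >>> del x  # the blocks go back to the caching allocator, not to the driver
+        >>> torch.cuda.empty_cache()  # unoccupied cached blocks are now released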
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
+ """
+ return memory_stats(device=device).get("allocated_bytes.all.peak", 0)
+
+
+def memory_reserved(device: Union[Device, int] = None) -> int:
+ r"""Return the current GPU memory managed by the caching allocator in bytes for a given device.
+
+ Args:
+ device (torch.device or int, optional): selected device. Returns
+ statistic for the current device, given by :func:`~torch.cuda.current_device`,
+ if :attr:`device` is ``None`` (default).
+
+ .. note::
+ See :ref:`cuda-memory-management` for more details about GPU memory
+ management.
+ """
+ return memory_stats(device=device).get("reserved_bytes.all.current", 0)
+
+
+def max_memory_reserved(device: Union[Device, int] = None) -> int:
+ r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device.
+
+ By default, this returns the peak cached memory since the beginning of this
+ program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset
+ the starting point in tracking this metric. For example, these two functions
+ can measure the peak cached memory amount of each iteration in a training
+ loop.
+
+ Args:
+ device (torch.device or int, optional): selected device. Returns
+ statistic for the current device, given by :func:`~torch.cuda.current_device`,
+ if :attr:`device` is ``None`` (default).
+
+ .. note::
+ See :ref:`cuda-memory-management` for more details about GPU memory
+ management.
+ """
+ return memory_stats(device=device).get("reserved_bytes.all.peak", 0)
+
+
+@deprecated(
+ "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`",
+ category=FutureWarning,
+)
+def memory_cached(device: Union[Device, int] = None) -> int:
+ r"""Deprecated; see :func:`~torch.cuda.memory_reserved`."""
+ return memory_reserved(device=device)
+
+
+@deprecated(
+ "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`",
+ category=FutureWarning,
+)
+def max_memory_cached(device: Union[Device, int] = None) -> int:
+ r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`."""
+ return max_memory_reserved(device=device)
+
+
+def memory_snapshot():
+ r"""Return a snapshot of the CUDA memory allocator state across all devices.
+
+ Interpreting the output of this function requires familiarity with the
+ memory allocator internals.
+
+ .. note::
+ See :ref:`cuda-memory-management` for more details about GPU memory
+ management.
+ """
+ return torch._C._cuda_memorySnapshot()["segments"]
+
+
+def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str:
+ r"""Return a human-readable printout of the current memory allocator statistics for a given device.
+
+ This can be useful to display periodically during training, or when
+ handling out-of-memory exceptions.
+
+ Args:
+ device (torch.device or int, optional): selected device. Returns
+ printout for the current device, given by :func:`~torch.cuda.current_device`,
+ if :attr:`device` is ``None`` (default).
+ abbreviated (bool, optional): whether to return an abbreviated summary
+ (default: False).
+
+ .. note::
+ See :ref:`cuda-memory-management` for more details about GPU memory
+ management.
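+
+ A minimal usage sketch (illustrative; assumes a CUDA device is available)::
+
+ print(torch.cuda.memory_summary(abbreviated=True))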
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
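+
+ Example (illustrative; assumes at least one visible CUDA device)::
+
+ free_bytes, total_bytes = torch.cuda.mem_get_info()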
+
+ """
+ if device is None:
+ device = torch.cuda.current_device()
+ # optional=True allows `device = torch.device('cuda')` for which device.index is None
+ device = _get_device_index(device, optional=True)
+ return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+ enabled: bool,
+ record_context=True,
+ trace_alloc_max_entries=1,
+ trace_alloc_record_context=False,
+ device: Union[Device, int] = None,
+ record_context_cpp=False,
+ clear_history=False,
+):
+ _C._cuda_record_memory_history_legacy(
+ enabled,
+ record_context,
+ trace_alloc_max_entries,
+ trace_alloc_record_context,
+ record_context_cpp,
+ clear_history,
+ )
+
+
+def _record_memory_history(
+ enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+ """Enable recording of stack traces associated with memory
+ allocations, so you can tell what allocated any piece of memory in
+ :func:`torch.cuda.memory._snapshot()`.
+
+ In addition to keeping stack traces with each current allocation and free,
+ this will also enable recording of a history of all alloc/free events.
+
+ Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+ and the tools in `_memory_viz.py` to visualize snapshots.
+
+ The Python trace collection is fast (2us per trace), so you may consider
+ enabling this on production jobs if you anticipate ever having to debug
+ memory issues.
+
+ C++ trace collection is also fast (~50ns/frame), which for many typical programs
+ works out to ~2us per trace, but can vary depending on stack depth.
+
+ Args:
+ enabled (Literal[None, "state", "all"], optional):
+ `None`, disable recording memory history.
+ `"state"`, keep information for currently allocated memory.
+ `"all"`, additionally keep a history of all alloc/free calls.
+ Defaults to "all".
+ context (Literal[None, "state", "alloc", "all"], optional):
+ `None`, do not record any tracebacks.
+ `"state"`, record tracebacks for currently allocated memory.
+ `"alloc"`, additionally keep tracebacks for alloc calls.
+ `"all"`, additionally keep tracebacks for free calls.
+ Defaults to "all".
+ stacks (Literal["python", "all"], optional):
+ `"python"`, include Python, TorchScript, and inductor frames in tracebacks.
+ `"all"`, additionally include C++ frames.
+ Defaults to "all".
+ max_entries (int, optional): Keep a maximum of `max_entries`
+ alloc/free events in the recorded history.
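+
+ A minimal end-to-end sketch (illustrative; ``run_workload()`` is a hypothetical
+ workload and the snapshot file name is arbitrary)::
+
+ torch.cuda.memory._record_memory_history(max_entries=100000)
+ run_workload()  # hypothetical code whose allocations should be traced
+ torch.cuda.memory._dump_snapshot("snapshot.pickle")
+ torch.cuda.memory._record_memory_history(enabled=None)  # stop recording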
+ """
+ if isinstance(enabled, bool):
+ return _record_memory_history_legacy(enabled, *args, **kwargs)
+ else:
+ return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+ enabled: Optional[str] = "all",
+ context: Optional[str] = "all",
+ stacks: str = "all",
+ max_entries: int = sys.maxsize,
+ device: Union[Device, int] = None,
+ clear_history: bool = False,
+):
+ _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+ """Save a snapshot of CUDA memory state at the time it was called.
+
+ The state is represented as a dictionary with the following structure.
+
+ .. code-block:: python
+
+ class Snapshot(TypedDict):
+ segments : List[Segment]
+ device_traces: List[List[TraceEntry]]
+
+ class Segment(TypedDict):
+ # Segments are memory returned from a cudaMalloc call.
+ # The size of reserved memory is the sum of all Segments.
+ # Segments are cached and reused for future allocations.
+ # If the reuse is smaller than the segment, the segment
+ # is split into more than one Block.
+ # empty_cache() frees Segments that are entirely inactive.
+ address: int
+ total_size: int # cudaMalloc'd size of segment
+ stream: int
+ segment_type: Literal['small', 'large'] # 'large' (>1MB)
+ allocated_size: int # size of memory in use
+ active_size: int # size of memory in use or in active_awaiting_free state
+ blocks : List[Block]
+
+ class Block(TypedDict):
+ # A piece of memory returned from the allocator, or
+ # currently cached but inactive.
+ size: int
+ requested_size: int # size requested during malloc, may be smaller than
+ # size due to rounding
+ address: int
+ state: Literal['active_allocated', # used by a tensor
+ 'active_awaiting_free', # waiting for another stream to finish using
+ # this, then it will become free
+ 'inactive',] # free for reuse
+ frames: List[Frame] # stack trace from where the allocation occurred
+
+ class Frame(TypedDict):
+ filename: str
+ line: int
+ name: str
+
+ class TraceEntry(TypedDict):
+ # When `torch.cuda.memory._record_memory_history()` is enabled,
+ # the snapshot will contain TraceEntry objects that record each
+ # action the allocator took.
+ action: Literal[
+ 'alloc', # memory allocated
+ 'free_requested', # the allocation received a call to free memory
+ 'free_completed', # the memory that was requested to be freed is now
+ # able to be used in future allocation calls
+ 'segment_alloc', # the caching allocator asked cudaMalloc for more memory
+ # and added it as a segment in its cache
+ 'segment_free', # the caching allocator called cudaFree to return memory
+ # to cuda, possibly trying to free up memory to
+ # allocate more segments or because empty_cache() was called
+ 'oom', # the allocator threw an OOM exception. 'size' is
+ # the requested number of bytes that did not succeed
+ 'snapshot' # the allocator generated a memory snapshot
+ # useful to correlate a previously taken
+ # snapshot with this trace
+ ]
+ addr: int # not present for OOM
+ frames: List[Frame]
+ size: int
+ stream: int
+ device_free: int # only present for OOM, the amount of
+ # memory cuda still reports to be free
+
+ Returns:
+ The Snapshot dictionary object
+ """
+ return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+ """
+ Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+ This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz.
+
+ Args:
+ filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+ """
+ s = _snapshot()
+ with open(filename, "wb") as f:
+ pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+ if snapshot is None:
+ snapshot = _snapshot()
+ with open(filename, "w") as f:
+ f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+ if snapshot is None:
+ snapshot = _snapshot()
+ with open(filename, "w") as f:
+ f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+ return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+ r"""Return a string describing the active allocator backend as set by
+ ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+ ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+ (CUDA's built-in asynchronous allocator).
+
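+ A quick check (illustrative)::
+
+ backend = torch.cuda.get_allocator_backend()  # e.g. "native"
+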
+ .. note::
+ See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+ """
+ return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+ r"""Wrapper over internal CUDA memory allocators."""
+
+ def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+ self._allocator = allocator
+
+ def allocator(self):
+ return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+ r"""CUDA memory allocator loaded from a .so file."""
+
+ def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+ r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+ To change the active allocator, use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+ Args:
+ path_to_so_file(str): Path in the filesystem to the `.so` file containing
+ the allocator functions.
+ alloc_fn_name(str): Name of the function to perform the memory allocation
+ in the so file. The signature must be:
+ void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+ free_fn_name(str): Name of the function to perform the memory release
+ in the so file. The signature must be:
+ void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+ .. warning::
+ This is currently supported only on Unix OSes.
+
+ .. note::
+ See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+ """
+ allocator = ctypes.CDLL(path_to_so_file)
+ alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+ free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+ assert alloc_fn is not None
+ assert free_fn is not None
+ self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+ r"""Change the currently used memory allocator to be the one provided.
+
+ If the current allocator has already been used/initialized, this function will error.
+
+ Args:
+ allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+
+ .. note::
+ See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+ """
+ torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+ r"""Return the allocator being currently used.
+
+ .. note::
+ See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+ """
+ return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+ r"""MemPoolContext holds the currently active pool and stashes the previous
+ pool. On deletion it makes the previous pool active.
+
+ Args:
+ pool(torch.cuda.MemPool): a MemPool object to be made active so that
+ allocations route to this pool.
+
+ """
+
+ def __init__(self, pool: _MemPool):
+ super().__init__(pool)
+
+ @staticmethod
+ def active_pool() -> Optional[_MemPool]:
+ r"""Returns the active MemPool"""
+ return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+ r"""MemPool represents a pool of memory in a caching allocator. Currently,
+ it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+ Args:
+ allocator(torch._C._cuda_CUDAAllocator, optional): a
+ torch._C._cuda_CUDAAllocator object that can be used to
+ define how memory gets allocated in the pool. If :attr:`allocator`
+ is ``None`` (default), memory allocation follows the default/
+ current configuration of the CUDACachingAllocator.
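+
+ A minimal sketch (illustrative; assumes a CUDA device; ``use_mem_pool`` is
+ defined later in this module)::
+
+ pool = MemPool()
+ with use_mem_pool(pool):
+     tensor = torch.empty(1024, device="cuda")  # allocation routed to ``pool``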
+
+ """
+
+ def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+ super().__init__(allocator, True)
+
+ @property
+ def id(self) -> tuple[int, int]:
+ r"""Returns the ID of this pool as a tuple of two ints."""
+ return super().id
+
+ @property
+ def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+ r"""Returns the allocator this MemPool routes allocations to."""
+ return super().allocator
+
+ def use_count(self) -> int:
+ r"""Returns the reference count of this pool."""
+ return super().use_count()
+
+ def snapshot(self):
+ r"""Return a snapshot of the CUDA memory allocator pool state across all
+ devices.
+
+ Interpreting the output of this function requires familiarity with the
+ memory allocator internals.
+
+ .. note::
+ See :ref:`cuda-memory-management` for more details about GPU memory
+ management.
+ """
+ try:
+ ctx = MemPoolContext(self)
+ snapshot = torch.cuda.memory_snapshot()
+ finally:
+ del ctx
+ return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+ r"""A context manager that routes allocations to a given pool.
+
+ Args:
+ pool(torch.cuda.MemPool): a MemPool object to be made active so that
+ allocations route to this pool.
+ device (torch.device or int, optional): selected device. Uses MemPool on
+ the current device, given by :func:`~torch.cuda.current_device`,
+ if :attr:`device` is ``None`` (default).
+
+ """
+ ctx = MemPoolContext(pool)
+ device_index = (
+ torch.cuda.current_device() if device is None else _get_device_index(device)
+ )
+ _cuda_beginAllocateToPool(device_index, pool.id)
+ try:
+ yield
+ finally:
+ _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+ _cuda_releasePool(device_index, pool.id)
+ del ctx
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
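+
+    Example (editorial sketch, not part of the original docstring; assumes a
+    CUDA device is available)::
+
+        import torch
+
+        pool = torch.cuda.MemPool()
+        with torch.cuda.use_mem_pool(pool):
+            # Allocations made inside the context are routed to `pool`.
+            x = torch.empty(1024, device="cuda")
+        del x
+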
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
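+
+    Example (editorial sketch, not part of the original docstring; assumes a
+    CUDA device is available)::
+
+        import torch
+
+        x = torch.empty(1024, 1024, device="cuda")
+        del x
+        torch.cuda.memory_allocated()  # drops once `x` is freed
+        torch.cuda.memory_reserved()   # the cached segment is still reserved
+        torch.cuda.empty_cache()       # return unused cached segments to the driver
+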
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
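+
+    Example (editorial sketch, not part of the original docstring): measuring
+    the peak tensor memory of each step by resetting the peak counters first.
+    ``run_one_step()`` and ``num_steps`` are hypothetical stand-ins for the
+    user's own training loop::
+
+        import torch
+
+        for _ in range(num_steps):
+            torch.cuda.reset_peak_memory_stats()
+            run_one_step()  # hypothetical training step
+            print(torch.cuda.max_memory_allocated())
+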
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
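+
+    Example (editorial sketch, not part of the original docstring): printing an
+    abbreviated summary when an out-of-memory error is caught::
+
+        import torch
+
+        try:
+            x = torch.empty(1 << 40, device="cuda")  # deliberately far too large
+        except torch.cuda.OutOfMemoryError:
+            print(torch.cuda.memory_summary(abbreviated=True))
+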
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
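+
+    Example (editorial sketch, not part of the original docstring)::
+
+        import torch
+
+        for idx in range(torch.cuda.device_count()):
+            print(torch.cuda.list_gpu_processes(idx))
+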
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
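+
+    Example (editorial sketch, not part of the original docstring)::
+
+        import torch
+
+        free_bytes, total_bytes = torch.cuda.mem_get_info()
+        print(f"{free_bytes / 2**30:.2f} GiB free of {total_bytes / 2**30:.2f} GiB")
+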
+ """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. + `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. 
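+            # -- Editorial usage sketch (not part of the snapshot schema) ------
+            # A typical recording workflow, assuming a CUDA device is available:
+            #     torch.cuda.memory._record_memory_history(max_entries=100000)
+            #     ...run the workload to be profiled...
+            #     torch.cuda.memory._dump_snapshot("snapshot.pickle")
+            #     torch.cuda.memory._record_memory_history(enabled=None)
+            # The pickle can then be opened at https://pytorch.org/memory_viz.
+            # ------------------------------------------------------------------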
+ # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. 
note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. + + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. 
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
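+
+    Example (editorial sketch, not part of the original docstring; assumes a
+    CUDA device is available)::
+
+        import torch
+
+        ptr = torch.cuda.caching_allocator_alloc(1024)  # raw 1 KiB device allocation
+        # ...hand `ptr` to another framework that expects a device pointer...
+        torch.cuda.caching_allocator_delete(ptr)
+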
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
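+
+    Example (editorial sketch, not part of the original docstring; assumes CUDA
+    has been initialized, and uses key names from the scheme documented below)::
+
+        import torch
+
+        stats = torch.cuda.memory_stats()
+        print(stats["allocated_bytes.all.current"])
+        print(stats["reserved_bytes.all.peak"])
+        print(stats["num_alloc_retries"])
+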
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
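+
+    Example (editorial sketch, not part of the original docstring; assumes some
+    pinned host memory has been allocated, e.g. with
+    ``torch.empty(..., pin_memory=True)``)::
+
+        import torch
+
+        stats = torch.cuda.host_memory_stats()
+        print(stats.get("allocated.current", 0))    # live pinned-memory allocations
+        print(stats.get("host_alloc_time.avg", 0))  # average host-allocation time
+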
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
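+
+ A minimal usage sketch (illustrative only; the throwaway tensor stands in
+ for one step of a real workload)::
+
+ >>> torch.cuda.reset_peak_memory_stats()
+ >>> x = torch.empty(1024, 1024, device="cuda")
+ >>> peak_bytes = torch.cuda.max_memory_reserved()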
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls.
+ `"all"`, additionally keep tracebacks for free calls.
+ Defaults to "all".
+ stacks (Literal["python", "all"], optional):
+ `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+ `"all"`, additionally include C++ frames
+ Defaults to "all".
+ max_entries (int, optional): Keep a maximum of `max_entries`
+ alloc/free events in the recorded history.
+ """
+ if isinstance(enabled, bool):
+ return _record_memory_history_legacy(enabled, *args, **kwargs)
+ else:
+ return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+ def _record_memory_history_impl(
+ enabled: Optional[str] = "all",
+ context: Optional[str] = "all",
+ stacks: str = "all",
+ max_entries: int = sys.maxsize,
+ device: Union[Device, int] = None,
+ clear_history: bool = False,
+ ):
+ _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+ _record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined]
+
+
+ def _snapshot(device: Union[Device, int] = None):
+ """Save a snapshot of CUDA memory state at the time it was called.
+
+ The state is represented as a dictionary with the following structure.
+
+ .. code-block:: python
+
+ class Snapshot(TypedDict):
+ segments : List[Segment]
+ device_traces: List[List[TraceEntry]]
+
+ class Segment(TypedDict):
+ # Segments are memory returned from a cudaMalloc call.
+ # The size of reserved memory is the sum of all Segments.
+ # Segments are cached and reused for future allocations.
+ # If the reuse is smaller than the segment, the segment
+ # is split into more than one Block.
+ # empty_cache() frees Segments that are entirely inactive.
+ address: int
+ total_size: int # cudaMalloc'd size of segment
+ stream: int
+ segment_type: Literal['small', 'large'] # 'large' (>1MB)
+ allocated_size: int # size of memory in use
+ active_size: int # size of memory in use or in active_awaiting_free state
+ blocks : List[Block]
+
+ class Block(TypedDict):
+ # A piece of memory returned from the allocator, or
+ # currently cached but inactive.
+ size: int
+ requested_size: int # size requested during malloc, may be smaller than
+ # size due to rounding
+ address: int
+ state: Literal['active_allocated', # used by a tensor
+ 'active_awaiting_free', # waiting for another stream to finish using
+ # this, then it will become free
+ 'inactive',] # free for reuse
+ frames: List[Frame] # stack trace from where the allocation occurred
+
+ class Frame(TypedDict):
+ filename: str
+ line: int
+ name: str
+
+ class TraceEntry(TypedDict):
+ # When `torch.cuda.memory._record_memory_history()` is enabled,
+ # the snapshot will contain TraceEntry objects that record each
+ # action the allocator took.
+ action: Literal[
+ 'alloc', # memory allocated
+ 'free_requested', # the allocation received a call to free memory
+ 'free_completed', # the memory that was requested to be freed is now
+ # able to be used in future allocation calls
+ 'segment_alloc', # the caching allocator asked cudaMalloc for more memory
+ # and added it as a segment in its cache
+ 'segment_free', # the caching allocator called cudaFree to return memory
+ # to cuda, possibly trying to free up memory to
+ # allocate more segments or because empty_cache() was called
+ 'oom', # the allocator threw an OOM exception. 'size' is
+ # the requested number of bytes that did not succeed
+ 'snapshot' # the allocator generated a memory snapshot
+ # useful to correlate a previously taken
+ # snapshot with this trace
+ ]
+ addr: int # not present for OOM
+ frames: List[Frame]
+ size: int
+ stream: int
+ device_free: int # only present for OOM, the amount of
+ # memory cuda still reports to be free
+
+ Returns:
+ The Snapshot dictionary object
+ """
+ return _C._cuda_memorySnapshot()
+
+
+ def _dump_snapshot(filename="dump_snapshot.pickle"):
+ """
+ Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+ This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+ Args:
+ filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+ """
+ s = _snapshot()
+ with open(filename, "wb") as f:
+ pickle.dump(s, f)
+
+
+ def _save_segment_usage(filename="output.svg", snapshot=None):
+ if snapshot is None:
+ snapshot = _snapshot()
+ with open(filename, "w") as f:
+ f.write(_segments(snapshot))
+
+
+ def _save_memory_usage(filename="output.svg", snapshot=None):
+ if snapshot is None:
+ snapshot = _snapshot()
+ with open(filename, "w") as f:
+ f.write(_memory(snapshot))
+
+
+ def _set_allocator_settings(env: str):
+ return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+ def get_allocator_backend() -> str:
+ r"""Return a string describing the active allocator backend as set by
+ ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+ ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+ (CUDA's built-in asynchronous allocator).
+
+ .. note::
+ See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+ """
+ return torch._C._cuda_getAllocatorBackend()
+
+
+ class _CUDAAllocator:
+ r"""Wrapper over internal CUDA memory allocators."""
+
+ def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+ self._allocator = allocator
+
+ def allocator(self):
+ return self._allocator
+
+
+ class CUDAPluggableAllocator(_CUDAAllocator):
+ r"""CUDA memory allocator loaded from a .so file."""
+
+ def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+ r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+ To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+ Args:
+ path_to_so_file(str): Path in the filesystem to the `.so` file containing
+ the allocator functions
+ alloc_fn_name(str): Name of the function to perform the memory allocation
+ in the so file. The signature must be:
+ void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+ free_fn_name(str): Name of the function to perform the memory release
+ in the so file. The signature must be:
+ void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+ .. warning::
+ This is currently supported only on Unix operating systems.
+
+ .. note::
+ See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+ """
+ allocator = ctypes.CDLL(path_to_so_file)
+ alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+ free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+ assert alloc_fn is not None
+ assert free_fn is not None
+ self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+ def change_current_allocator(allocator: _CUDAAllocator) -> None:
+ r"""Change the currently used memory allocator to be the one provided.
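+
+ For example (an illustrative sketch; ``alloc.so``, ``my_malloc`` and ``my_free``
+ are placeholder names for a user-provided allocator library)::
+
+ >>> new_alloc = torch.cuda.memory.CUDAPluggableAllocator("alloc.so", "my_malloc", "my_free")
+ >>> torch.cuda.memory.change_current_allocator(new_alloc)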
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
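+
+ A minimal usage sketch (illustrative only; assumes a CUDA device is available)::
+
+ >>> pool = torch.cuda.MemPool()
+ >>> with torch.cuda.use_mem_pool(pool):
+ ...     x = torch.randn(1024, device="cuda")  # allocated from ``pool``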
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
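+
+ A minimal usage sketch (illustrative only; assumes a CUDA-capable build)::
+
+ >>> x = torch.empty(1024, 1024, device="cuda")
+ >>> del x  # the block is returned to the caching allocator, not the driver
+ >>> torch.cuda.empty_cache()  # cached, unoccupied memory is released to the driver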
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
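+
+ A minimal usage sketch (illustrative only; the oversized request is meant to
+ fail so the summary can be printed from the exception handler)::
+
+ >>> try:
+ ...     huge = torch.empty(1 << 40, device="cuda")  # deliberately too large
+ ... except torch.cuda.OutOfMemoryError:
+ ...     print(torch.cuda.memory_summary(abbreviated=True))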
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
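+
+ A minimal usage sketch (illustrative only)::
+
+ >>> print(torch.cuda.list_gpu_processes())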
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
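+
+ A minimal usage sketch (illustrative only; both values are in bytes)::
+
+ >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+ >>> print(f"{free_bytes / 2**30:.1f} GiB free of {total_bytes / 2**30:.1f} GiB")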
+
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, Do not record any tracebacks.
+            `"state"`, Record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large'] # 'large' (>1MB)
+            allocated_size: int # size of memory in use
+            active_size: int # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int # size requested during malloc, may be smaller than
+                                # size due to rounding
+            address: int
+            state: Literal['active_allocated', # used by a tensor
+                           'active_awaiting_free', # waiting for another stream to finish using
+                                                   # this, then it will become free
+                           'inactive',] # free for reuse
+            frames: List[Frame] # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+            'alloc',  # memory allocated
+            'free_requested', # the allocator received a call to free memory
+            'free_completed', # the memory that was requested to be freed is now
+                              # able to be used in future allocation calls
+            'segment_alloc', # the caching allocator asks cudaMalloc for more memory
+                             # and added it as a segment in its cache
+            'segment_free', # the caching allocator called cudaFree to return memory
+                            # to cuda, possibly trying to free up memory to
+                            # allocate more segments or because empty_cache was called
+            'oom', # the allocator threw an OOM exception. 'size' is
+                   # the requested number of bytes that did not succeed
+            'snapshot' # the allocator generated a memory snapshot
+                       # useful to correlate a previously taken
+                       # snapshot with this trace
+            ]
+            addr: int # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int # only present for OOM, the amount of
+                             # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a `.so` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on UNIX-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
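+
+    Example (an illustrative sketch added in this edit, not upstream
+    documentation; it assumes an available CUDA device)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> # Allocations made inside the context manager are routed to ``pool``.
+        >>> with torch.cuda.use_mem_pool(pool):
+        ...     buf = torch.empty(1024, device="cuda")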
+ + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . 
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
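+
+    Example (an illustrative sketch added in this edit, not upstream
+    documentation; it assumes a CUDA device and frees the raw pointer right away)::
+
+        >>> ptr = torch.cuda.caching_allocator_alloc(1024)  # 1 KiB on the current device/stream
+        >>> torch.cuda.caching_allocator_delete(ptr)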
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
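+
+    Example (an illustrative sketch added in this edit, not upstream
+    documentation; the key used below is one of the stats described later in
+    this docstring)::
+
+        >>> stats = torch.cuda.memory_stats()
+        >>> peak_alloc = stats["allocated_bytes.all.peak"]  # peak allocated bytes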
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
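+
+    Example (an illustrative sketch added in this edit, not upstream
+    documentation; ``train_step`` is a hypothetical stand-in for one training
+    iteration)::
+
+        >>> torch.cuda.reset_peak_memory_stats()
+        >>> train_step()
+        >>> peak_reserved = torch.cuda.max_memory_reserved()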
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+ + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + if device is None: + device = torch.cuda.current_device() + # optional=True allows `device = torch.device('cuda')` for which device.index is None + device = _get_device_index(device, optional=True) + return torch.cuda.cudart().cudaMemGetInfo(device) + + +def _record_memory_history_legacy( + enabled: bool, + record_context=True, + trace_alloc_max_entries=1, + trace_alloc_record_context=False, + device: Union[Device, int] = None, + record_context_cpp=False, + clear_history=False, +): + _C._cuda_record_memory_history_legacy( + enabled, + record_context, + trace_alloc_max_entries, + trace_alloc_record_context, + record_context_cpp, + clear_history, + ) + + +def _record_memory_history( + enabled: Literal[None, "state", "all"] = "all", *args, **kwargs +) -> None: + """Enable recording of stack traces associated with memory + allocations, so you can tell what allocated any piece of memory in + :func:`torch.cuda.memory._snapshot()`. + + In addition too keeping stack traces with each current allocation and free, + this will also enable recording of a history of all alloc/free events. + + Use :func:`torch.cuda.memory._snapshot()` to retrieve this information, + and the tools in `_memory_viz.py` to visualize snapshots. + + The Python trace collection is fast (2us per trace), so you may consider + enabling this on production jobs if you anticipate ever having to debug + memory issues. + + C++ trace collection is also fast (~50ns/frame), which for many typical programs + works out to ~2us per trace, but can vary depending on stack depth. + + Args: + enabled (Literal[None, "state", "all"], optional): + `None`, disable recording memory history. + `"state"`, keep information for currenly allocated memory. + `"all"`, additionally keep a history of all alloc/free calls. + Defaults to "all". + context (Literal[None, "state", "alloc", "all"], optional): + `None`, Do not record any tracebacks. + `"state"`, Record tracebacks for currently allocated memory. 
+ `"alloc"`, additionally keep tracebacks for alloc calls. + `"all"`, additionally keep tracebacks for free calls. + Defaults to "all". + stacks (Literal["python", "all"], optional): + `"python"`, include Python, TorchScript, and inductor frames in tracebacks + `"all"`, additionally include C++ frames + Defaults to "all". + max_entries (int, optional): Keep a maximum of `max_entries` + alloc/free events in the recorded history recorded. + """ + if isinstance(enabled, bool): + return _record_memory_history_legacy(enabled, *args, **kwargs) + else: + return _record_memory_history_impl(enabled, *args, **kwargs) + + +def _record_memory_history_impl( + enabled: Optional[str] = "all", + context: Optional[str] = "all", + stacks: str = "all", + max_entries: int = sys.maxsize, + device: Union[Device, int] = None, + clear_history: bool = False, +): + _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history) + + +_record_memory_history.__signature__ = signature(_record_memory_history_impl) # type: ignore[attr-defined] + + +def _snapshot(device: Union[Device, int] = None): + """Save a snapshot of CUDA memory state at the time it was called. + + The state is represented as a dictionary with the following structure. + + .. code-block:: python + + class Snapshot(TypedDict): + segments : List[Segment] + device_traces: List[List[TraceEntry]] + + class Segment(TypedDict): + # Segments are memory returned from a cudaMalloc call. + # The size of reserved memory is the sum of all Segments. + # Segments are cached and reused for future allocations. + # If the reuse is smaller than the segment, the segment + # is split into more then one Block. + # empty_cache() frees Segments that are entirely inactive. + address: int + total_size: int # cudaMalloc'd size of segment + stream: int + segment_type: Literal['small', 'large'] # 'large' (>1MB) + allocated_size: int # size of memory in use + active_size: int # size of memory in use or in active_awaiting_free state + blocks : List[Block] + + class Block(TypedDict): + # A piece of memory returned from the allocator, or + # current cached but inactive. + size: int + requested_size: int # size requested during malloc, may be smaller than + # size due to rounding + address: int + state: Literal['active_allocated', # used by a tensor + 'active_awaiting_free', # waiting for another stream to finish using + # this, then it will become free + 'inactive',] # free for reuse + frames: List[Frame] # stack trace from where the allocation occurred + + class Frame(TypedDict): + filename: str + line: int + name: str + + class TraceEntry(TypedDict): + # When `torch.cuda.memory._record_memory_history()` is enabled, + # the snapshot will contain TraceEntry objects that record each + # action the allocator took. + action: Literal[ + 'alloc' # memory allocated + 'free_requested', # the allocated received a call to free memory + 'free_completed', # the memory that was requested to be freed is now + # able to be used in future allocation calls + 'segment_alloc', # the caching allocator ask cudaMalloc for more memory + # and added it as a segment in its cache + 'segment_free', # the caching allocator called cudaFree to return memory + # to cuda possibly trying free up memory to + # allocate more segments or because empty_caches was called + 'oom', # the allocator threw an OOM exception. 
'size' is + # the requested number of bytes that did not succeed + 'snapshot' # the allocator generated a memory snapshot + # useful to coorelate a previously taken + # snapshot with this trace + ] + addr: int # not present for OOM + frames: List[Frame] + size: int + stream: int + device_free: int # only present for OOM, the amount of + # memory cuda still reports to be free + + Returns: + The Snapshot dictionary object + """ + return _C._cuda_memorySnapshot() + + +def _dump_snapshot(filename="dump_snapshot.pickle"): + """ + Save a pickled version of the `torch.memory._snapshot()` dictionary to a file. + + This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz + + Args: + filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle". + """ + s = _snapshot() + with open(filename, "wb") as f: + pickle.dump(s, f) + + +def _save_segment_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_segments(snapshot)) + + +def _save_memory_usage(filename="output.svg", snapshot=None): + if snapshot is None: + snapshot = _snapshot() + with open(filename, "w") as f: + f.write(_memory(snapshot)) + + +def _set_allocator_settings(env: str): + return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env) + + +def get_allocator_backend() -> str: + r"""Return a string describing the active allocator backend as set by + ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are + ``native`` (PyTorch's native caching allocator) and `cudaMallocAsync`` + (CUDA's built-in asynchronous allocator). + + .. note:: + See :ref:`cuda-memory-management` for details on choosing the allocator backend. + """ + return torch._C._cuda_getAllocatorBackend() + + +class _CUDAAllocator: + r"""Wrapper over internal CUDA memory allocators.""" + + def __init__(self, allocator: torch._C._cuda_CUDAAllocator): + self._allocator = allocator + + def allocator(self): + return self._allocator + + +class CUDAPluggableAllocator(_CUDAAllocator): + r"""CUDA memory allocator loaded from a so file.""" + + def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str): + r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes. + + To change the active allocator use the :func:`torch.memory.cuda.change_current_allocator` function. + + Args: + path_to_so_file(str): Path in the filesystem to the `.so` file containing + the allocator functions + alloc_fn_name(str): Name of the function to perform the memory allocation + in the so file. The signature must be: + void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream); + free_fn_name(str): Name of the function to perform the memory release + in the so file. The signature must be: + void free_fn_name(void* ptr, size_t size, cudaStream_t stream); + + .. warning:: + This is currently supported only in unix OSs + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + allocator = ctypes.CDLL(path_to_so_file) + alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value + free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value + assert alloc_fn is not None + assert free_fn is not None + self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn) + + +def change_current_allocator(allocator: _CUDAAllocator) -> None: + r"""Change the currently used memory allocator to be the one provided. 
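+
+    Example (an illustrative sketch added in this edit, not upstream
+    documentation; ``alloc.so``, ``my_malloc`` and ``my_free`` are hypothetical
+    names)::
+
+        >>> custom = torch.cuda.memory.CUDAPluggableAllocator(
+        ...     "alloc.so", "my_malloc", "my_free"
+        ... )
+        >>> torch.cuda.memory.change_current_allocator(custom)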
+ + If the current allocator has already been used/initialized, this function will error. + + + Args: + allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one. + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + torch._C._cuda_changeCurrentAllocator(allocator.allocator()) + + +def _get_current_allocator() -> _CUDAAllocator: + r"""Return the allocator being currently used. + + .. note:: + See :ref:`cuda-memory-management` for details on creating and using a custom allocator + """ + return _CUDAAllocator(torch._C._cuda_getAllocator()) + + +class MemPoolContext(_MemPoolContext): + r"""MemPoolContext holds the currently active pool and stashes the previous + pool. On deletion it makes the previous pool active. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + + """ + + def __init__(self, pool: _MemPool): + super().__init__(pool) + + @staticmethod + def active_pool() -> Optional[_MemPool]: + r"""Returns the active MemPool""" + return _MemPoolContext.active_pool() + + +class MemPool(_MemPool): + r"""MemPool represents a pool of memory in a caching allocator. Currently, + it's just the ID of the pool object maintained in the CUDACachingAllocator. + + Args: + allocator(torch._C._cuda_CUDAAllocator, optional): a + torch._C._cuda_CUDAAllocator object that can be used to + define how memory gets allocated in the pool. If :attr:`allocator` + is ``None`` (default), memory allocation follows the default/ + current configuration of the CUDACachingAllocator. + + """ + + def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None): + super().__init__(allocator, True) + + @property + def id(self) -> tuple[int, int]: + r"""Returns the ID of this pool as a tuple of two ints.""" + return super().id + + @property + def allocator(self) -> Optional[_cuda_CUDAAllocator]: + r"""Returns the allocator this MemPool routes allocations to.""" + return super().allocator + + def use_count(self) -> int: + r"""Returns the reference count of this pool.""" + return super().use_count() + + def snapshot(self): + r"""Return a snapshot of the CUDA memory allocator pool state across all + devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + try: + ctx = MemPoolContext(self) + snapshot = torch.cuda.memory_snapshot() + finally: + del ctx + return snapshot + + +@contextlib.contextmanager +def use_mem_pool(pool: MemPool, device: Union[Device, int] = None): + r"""A context manager that routes allocations to a given pool. + + Args: + pool(torch.cuda.MemPool): a MemPool object to be made active so that + allocations route to this pool. + device (torch.device or int, optional): selected device. Uses MemPool on + the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). 
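+
+    Example (an illustrative sketch added in this edit, not upstream
+    documentation; it assumes CUDA device 0 is available)::
+
+        >>> pool = torch.cuda.MemPool()
+        >>> with torch.cuda.use_mem_pool(pool, device=0):
+        ...     scratch = torch.empty(4096, device="cuda:0")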
+ + """ + ctx = MemPoolContext(pool) + device_index = ( + torch.cuda.current_device() if device is None else _get_device_index(device) + ) + _cuda_beginAllocateToPool(device_index, pool.id) + try: + yield + finally: + _cuda_endAllocateCurrentStreamToPool(device_index, pool.id) + _cuda_releasePool(device_index, pool.id) + del ctx +# mypy: allow-untyped-defs +r"""This package adds support for device memory management implemented in CUDA.""" + +import collections +import contextlib +import ctypes +import pickle +import sys +import warnings +from inspect import signature +from typing import Any, Literal, Optional, Union +from typing_extensions import deprecated + +import torch +from torch import _C +from torch._utils import _dummy_type +from torch.types import Device + +from . import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. 
If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. 
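+
+    Example (an illustrative sketch added in this edit, not upstream
+    documentation; ``big`` is a placeholder tensor)::
+
+        >>> big = torch.empty(1024, 1024, device="cuda")
+        >>> del big  # drop the last reference so the block becomes cacheable
+        >>> torch.cuda.empty_cache()  # release unoccupied cached blocks back to CUDA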
+ """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. 
The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. 
+ + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. 
+ + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
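+
+    Example (an illustrative sketch added here, not from the original docstring;
+    it assumes a CUDA device is present and uses an arbitrary allocation as a
+    stand-in workload)::
+
+        >>> import torch
+        >>> _ = torch.empty(256, 256, device="cuda")
+        >>> print(torch.cuda.memory_summary(abbreviated=True))  # doctest: +SKIP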
+ """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. 
+ + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" + + device = _get_amdsmi_device_index(device) + + try: + handle = amdsmi.amdsmi_get_processor_handles()[device] # type: ignore[attr-defined] + procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi cannot list processes from other users" + + lines = [] + lines.append(f"GPU:{device}") + if len(procs) == 0: + lines.append("no processes are running") + for p in procs: + if not torch.version.hip: + mem = p.usedGpuMemory / (1024 * 1024) + pid = p.pid + else: + try: + proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p) # type: ignore[possibly-undefined] + except AttributeError: + # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7 + # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi + proc_info = p + mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024) + pid = proc_info["pid"] + lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory") + return "\n".join(lines) + + +def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]: + r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo. + + Args: + device (torch.device or int or str, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default) or if the device index is not specified. + + .. note:: + See :ref:`cuda-memory-management` for more + details about GPU memory management. 
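+
+    Example (an illustrative sketch added here, not from the original docstring;
+    it assumes a CUDA device is present)::
+
+        >>> import torch
+        >>> free_bytes, total_bytes = torch.cuda.mem_get_info()
+        >>> used_fraction = 1 - free_bytes / total_bytes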
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments : List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks : List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',     # used by a tensor
+                           'active_awaiting_free', # waiting for another stream to finish using
+                                                   # this, then it will become free
+                           'inactive',]            # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',          # memory allocated
+                'free_requested', # the allocator received a call to free memory
+                'free_completed', # the memory that was requested to be freed is now
+                                  # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asked cudaMalloc for more memory
+                                  # and added it as a segment in its cache
+                'segment_free',   # the caching allocator called cudaFree to return memory
+                                  # to cuda, possibly trying to free up memory to
+                                  # allocate more segments or because empty_cache was called
+                'oom',            # the allocator threw an OOM exception. 'size' is
+                                  # the requested number of bytes that did not succeed
+                'snapshot',       # the allocator generated a memory snapshot
+                                  # useful to correlate a previously taken
+                                  # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory cuda still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    ..
note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
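+
+    Example (an illustrative sketch added here, not from the original docstring;
+    it assumes a CUDA device is present and that ``MemPool`` and ``use_mem_pool``
+    from this module are in scope; the tensor sizes are arbitrary)::
+
+        >>> pool = MemPool()
+        >>> with use_mem_pool(pool):
+        ...     x = torch.empty(1024, device="cuda")  # allocation served from ``pool``
+        >>> y = torch.empty(1024, device="cuda")      # served from the default pool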
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+        device (torch.device or int, optional): selected device. Uses MemPool on
+            the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default).
+
+    """
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        _cuda_releasePool(device_index, pool.id)
+        del ctx
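+
+
+# Illustrative sketch (not part of the upstream module): an end-to-end helper
+# showing the record -> run -> dump workflow documented for
+# `_record_memory_history()` and `_dump_snapshot()` above. The helper name,
+# the stand-in workload, and the default filename are arbitrary; a CUDA build
+# of PyTorch and a visible GPU are assumed. It is defined here but never called.
+def _example_capture_memory_history(filename: str = "dump_snapshot.pickle") -> None:
+    _record_memory_history(enabled="all", context="all", stacks="python")
+    try:
+        workload = torch.empty(4096, 4096, device="cuda")  # stand-in workload
+        del workload
+        _dump_snapshot(filename)  # open the file at pytorch.org/memory_viz
+    finally:
+        _record_memory_history(enabled=None)  # stop recording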
import ( + _get_amdsmi_device_index, + _get_device_index, + _get_nvml_device_index, + _lazy_init, + is_initialized, +) +from ._memory_viz import memory as _memory, segments as _segments + + +__all__ = [ + "caching_allocator_alloc", + "caching_allocator_delete", + "caching_allocator_enable", + "get_per_process_memory_fraction", + "set_per_process_memory_fraction", + "empty_cache", + "memory_stats", + "memory_stats_as_nested_dict", + "reset_accumulated_memory_stats", + "reset_peak_memory_stats", + "reset_max_memory_allocated", + "reset_max_memory_cached", + "host_memory_stats", + "host_memory_stats_as_nested_dict", + "reset_accumulated_host_memory_stats", + "reset_peak_host_memory_stats", + "memory_allocated", + "max_memory_allocated", + "memory_reserved", + "max_memory_reserved", + "memory_cached", + "max_memory_cached", + "memory_snapshot", + "memory_summary", + "list_gpu_processes", + "mem_get_info", + "get_allocator_backend", + "CUDAPluggableAllocator", + "change_current_allocator", + "MemPool", + "MemPoolContext", + "use_mem_pool", +] + + +if not hasattr(torch._C, "_cuda_CUDAAllocator"): + # Define dummy base classes + torch._C.__dict__["_cuda_CUDAAllocator"] = _dummy_type("_cuda_CUDAAllocator") + + +if not hasattr(torch._C, "_MemPool"): + # Define dummy base classes + torch._C.__dict__["_MemPool"] = _dummy_type("_MemPool") + torch._C.__dict__["_MemPoolContext"] = _dummy_type("_MemPoolContext") + torch._C.__dict__["_cuda_beginAllocateToPool"] = _dummy_type( + "_cuda_beginAllocateToPool" + ) + torch._C.__dict__["_cuda_endAllocateCurrentStreamToPool"] = _dummy_type( + "_cuda_endAllocateCurrentStreamToPool" + ) + torch._C.__dict__["_cuda_releasePool"] = _dummy_type("_cuda_releasePool") + +from torch._C import ( # noqa: F401 + _cuda_beginAllocateToPool, + _cuda_CUDAAllocator, + _cuda_endAllocateCurrentStreamToPool, + _cuda_releasePool, + _MemPool, + _MemPoolContext, +) + + +def _host_allocator(): + _lazy_init() + return torch._C._cuda_cudaHostAllocator() + + +@contextlib.contextmanager +def _free_mutex(): + torch._C._cuda_lock_mutex() + try: + yield + finally: + torch._C._cuda_unlock_mutex() + + +def caching_allocator_alloc(size, device: Union[Device, int] = None, stream=None): + r"""Perform a memory allocation using the CUDA memory allocator. + + Memory is allocated for a given device and a stream, this + function is intended to be used for interoperability with other + frameworks. Allocated memory is released through + :func:`~torch.cuda.caching_allocator_delete`. + + Args: + size (int): number of bytes to be allocated. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + stream (torch.cuda.Stream or int, optional): selected stream. If is ``None`` then + the default stream for the selected device is used. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if stream is None: + stream = torch.cuda.current_stream(device) + if isinstance(stream, torch.cuda.streams.Stream): + stream = stream.cuda_stream + if not isinstance(stream, int): + raise TypeError( + "Invalid type for stream argument, must be " + "`torch.cuda.Stream` or `int` representing a pointer " + "to a existing stream" + ) + with torch.cuda.device(device): + return torch._C._cuda_cudaCachingAllocator_raw_alloc(size, stream) + + +def caching_allocator_delete(mem_ptr): + r"""Delete memory allocated using the CUDA memory allocator. + + Memory allocated with :func:`~torch.cuda.caching_allocator_alloc`. + is freed here. The associated device and stream are tracked inside + the allocator. + + Args: + mem_ptr (int): memory address to be freed by the allocator. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr) + + +def caching_allocator_enable(value: bool = True) -> None: + r"""Enable or disable the CUDA memory allocator. On by default.""" + if is_initialized(): + torch._C._cuda_cudaCachingAllocator_enable(value) + + +def set_per_process_memory_fraction( + fraction, device: Union[Device, int] = None +) -> None: + r"""Set memory fraction for a process. + + The fraction is used to limit an caching allocator to allocated memory on a CUDA device. + The allowed value equals the total visible memory multiplied fraction. + If trying to allocate more than the allowed value in a process, will raise an out of + memory error in allocator. + + Args: + fraction(float): Range: 0~1. Allowed memory equals total_memory * fraction. + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + .. note:: + In general, the total available free memory is less than the total capacity. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + if not isinstance(fraction, float): + raise TypeError("Invalid type for fraction argument, must be `float`") + if fraction < 0 or fraction > 1: + raise ValueError(f"Invalid fraction value: {fraction}. Allowed range: 0~1") + + torch._C._cuda_setMemoryFraction(fraction, device) + + +def get_per_process_memory_fraction(device: Union[Device, int] = None) -> float: + r"""Get memory fraction for a process. + + Args: + device (torch.device or int, optional): selected device. If it is + ``None`` the default CUDA device is used. + Returns: + memory fraction, in range 0~1. Allowed memory equals total_memory * fraction. + """ + _lazy_init() + if device is None: + device = torch.cuda.current_device() + device = _get_device_index(device) + return torch._C._cuda_getMemoryFraction(device) + + +def empty_cache() -> None: + r"""Release all unoccupied cached memory currently held by the caching + allocator so that those can be used in other GPU application and visible in + `nvidia-smi`. + + .. note:: + :func:`~torch.cuda.empty_cache` doesn't increase the amount of GPU + memory available for PyTorch. However, it may help reduce fragmentation + of GPU memory in certain cases. See :ref:`cuda-memory-management` for + more details about GPU memory management. + """ + if is_initialized(): + torch._C._cuda_emptyCache() + + +def memory_stats(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. 
+ + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of reserved memory. + - ``"active.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of active memory blocks. + - ``"active_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of active memory. + - ``"inactive_split.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + number of inactive, non-releasable memory blocks. + - ``"inactive_split_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + amount of inactive, non-releasable memory. + + For these core statistics, values are broken down as follows. + + Pool type: + + - ``all``: combined statistics across all memory pools. + - ``large_pool``: statistics for the large allocation pool + (as of October 2019, for size >= 1MB allocations). + - ``small_pool``: statistics for the small allocation pool + (as of October 2019, for size < 1MB allocations). + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. + + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_alloc_retries"``: number of failed ``cudaMalloc`` calls that + result in a cache flush and retry. + - ``"num_ooms"``: number of out-of-memory errors thrown. + - ``"num_sync_all_streams"``: number of ``synchronize_and_free_events`` calls. + - ``"num_device_alloc"``: number of CUDA allocation calls. This includes both + cuMemMap and cudaMalloc. + - ``"num_device_free"``: number of CUDA free calls. This includes both cuMemUnmap + and cudaFree. + + The caching allocator can be configured via ENV to not split blocks larger than a + defined size (see Memory Management section of the Cuda Semantics documentation). + This helps avoid memory fragmentation but may have a performance + penalty. Additional outputs to assist with tuning and evaluating impact: + + - ``"max_split_size"``: blocks above this size will not be split. + - ``"oversize_allocations.{current,peak,allocated,freed}"``: + number of over-size allocation requests received by the memory allocator. + - ``"oversize_segments.{current,peak,allocated,freed}"``: + number of over-size reserved segments from ``cudaMalloc()``. + + The caching allocator can be configured via ENV to round memory allocations in order + to reduce fragmentation. Sometimes the overhead from rounding can be higher than + the fragmentation it helps reduce. The following stat can be used to check if + rounding adds too much overhead: + + - ``"requested_bytes.{all,large_pool,small_pool}.{current,peak,allocated,freed}"``: + memory requested by client code, compare this with allocated_bytes to check if + allocation rounding adds too much overhead. + + Args: + device (torch.device or int, optional): selected device. 
Returns + statistics for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + + .. note:: + With :ref:`backend:cudaMallocAsync`, some stats are not + meaningful, and are always reported as zero. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = memory_stats_as_nested_dict(device=device) + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def memory_stats_as_nested_dict(device: Union[Device, int] = None) -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + device = _get_device_index(device, optional=True) + return torch._C._cuda_memoryStats(device) + + +def reset_accumulated_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "accumulated" (historical) stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict, as well as + `"num_alloc_retries"` and `"num_ooms"`. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetAccumulatedMemoryStats(device) + + +def reset_peak_memory_stats(device: Union[Device, int] = None) -> None: + r"""Reset the "peak" stats tracked by the CUDA memory allocator. + + See :func:`~torch.cuda.memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + return torch._C._cuda_resetPeakMemoryStats(device) + + +def host_memory_stats() -> dict[str, Any]: + r"""Return a dictionary of CUDA memory allocator statistics for a given device. + + The return value of this function is a dictionary of statistics, each of + which is a non-negative integer. + + Core statistics: + + - ``"allocated.{current,peak,allocated,freed}"``: + number of allocation requests received by the memory allocator. + - ``"allocated_bytes.{current,peak,allocated,freed}"``: + amount of allocated memory. + - ``"segment.{current,peak,allocated,freed}"``: + number of reserved segments from ``cudaMalloc()``. + - ``"reserved_bytes.{current,peak,allocated,freed}"``: + amount of reserved memory. + + For these core statistics, values are broken down as follows. + + Metric type: + + - ``current``: current value of this metric. + - ``peak``: maximum value of this metric. + - ``allocated``: historical total increase in this metric. + - ``freed``: historical total decrease in this metric. 
+ + In addition to the core statistics, we also provide some simple event + counters: + + - ``"num_host_alloc"``: number of CUDA allocation calls. This includes both + cudaHostAlloc and cudaHostRegister. + - ``"num_host_free"``: number of CUDA free calls. This includes both cudaHostFree + and cudaHostUnregister. + + Finally, we also provide some simple timing counters: + + - ``"host_alloc_time.{total,max,min,count,avg}"``: + timing of allocation requests going through CUDA calls. + - ``"host_free_time.{total,max,min,count,avg}"``: + timing of free requests going through CUDA calls. + + For these timing statistics, values are broken down as follows. + + Metric type: + + - ``total``: total time spent. + - ``max``: maximum value per call. + - ``min``: minimum value per call. + - ``count``: number of times it was called. + - ``avg``: average time per call. + """ + result = [] + + def _recurse_add_to_result(prefix, obj): + if isinstance(obj, dict): + if len(prefix) > 0: + prefix += "." + for k, v in obj.items(): + _recurse_add_to_result(prefix + k, v) + else: + result.append((prefix, obj)) + + stats = host_memory_stats_as_nested_dict() + _recurse_add_to_result("", stats) + result.sort() + + return collections.OrderedDict(result) + + +def host_memory_stats_as_nested_dict() -> dict[str, Any]: + r"""Return the result of :func:`~torch.cuda.host_memory_stats` as a nested dictionary.""" + if not is_initialized(): + return {} + return torch._C._cuda_hostMemoryStats() + + +def reset_accumulated_host_memory_stats() -> None: + r"""Reset the "accumulated" (historical) stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Accumulated stats correspond to + the `"allocated"` and `"freed"` keys in each individual stat dict. + """ + return torch._C._cuda_resetAccumulatedHostMemoryStats() + + +def reset_peak_host_memory_stats() -> None: + r"""Reset the "peak" stats tracked by the host memory allocator. + + See :func:`~torch.cuda.host_memory_stats` for details. Peak stats correspond to the + `"peak"` key in each individual stat dict. + """ + return torch._C._cuda_resetPeakHostMemoryStats() + + +def reset_max_memory_allocated(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory occupied by tensors for a given device. + + See :func:`~torch.cuda.max_memory_allocated` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_allocated now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def reset_max_memory_cached(device: Union[Device, int] = None) -> None: + r"""Reset the starting point in tracking maximum GPU memory managed by the caching allocator for a given device. + + See :func:`~torch.cuda.max_memory_cached` for details. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. 
warning:: + This function now calls :func:`~torch.cuda.reset_peak_memory_stats`, which resets + /all/ peak memory stats. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + warnings.warn( + "torch.cuda.reset_max_memory_cached now calls torch.cuda.reset_peak_memory_stats, " + "which resets /all/ peak memory stats.", + FutureWarning, + ) + return reset_peak_memory_stats(device=device) + + +def memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory occupied by tensors in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + This is likely less than the amount shown in `nvidia-smi` since some + unused memory can be held by the caching allocator and some context + needs to be created on GPU. See :ref:`cuda-memory-management` for more + details about GPU memory management. + """ + return memory_stats(device=device).get("allocated_bytes.all.current", 0) + + +def max_memory_allocated(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory occupied by tensors in bytes for a given device. + + By default, this returns the peak allocated memory since the beginning of + this program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to + reset the starting point in tracking this metric. For example, these two + functions can measure the peak allocated memory usage of each iteration in a + training loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("allocated_bytes.all.peak", 0) + + +def memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the current GPU memory managed by the caching allocator in bytes for a given device. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return memory_stats(device=device).get("reserved_bytes.all.current", 0) + + +def max_memory_reserved(device: Union[Device, int] = None) -> int: + r"""Return the maximum GPU memory managed by the caching allocator in bytes for a given device. + + By default, this returns the peak cached memory since the beginning of this + program. :func:`~torch.cuda.reset_peak_memory_stats` can be used to reset + the starting point in tracking this metric. For example, these two functions + can measure the peak cached memory amount of each iteration in a training + loop. + + Args: + device (torch.device or int, optional): selected device. Returns + statistic for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. 
+ """ + return memory_stats(device=device).get("reserved_bytes.all.peak", 0) + + +@deprecated( + "`torch.cuda.memory_cached` has been renamed to `torch.cuda.memory_reserved`", + category=FutureWarning, +) +def memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.memory_reserved`.""" + return memory_reserved(device=device) + + +@deprecated( + "`torch.cuda.max_memory_cached` has been renamed to `torch.cuda.max_memory_reserved`", + category=FutureWarning, +) +def max_memory_cached(device: Union[Device, int] = None) -> int: + r"""Deprecated; see :func:`~torch.cuda.max_memory_reserved`.""" + return max_memory_reserved(device=device) + + +def memory_snapshot(): + r"""Return a snapshot of the CUDA memory allocator state across all devices. + + Interpreting the output of this function requires familiarity with the + memory allocator internals. + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + return torch._C._cuda_memorySnapshot()["segments"] + + +def memory_summary(device: Union[Device, int] = None, abbreviated: bool = False) -> str: + r"""Return a human-readable printout of the current memory allocator statistics for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + abbreviated (bool, optional): whether to return an abbreviated summary + (default: False). + + .. note:: + See :ref:`cuda-memory-management` for more details about GPU memory + management. + """ + device = _get_device_index(device, optional=True) + stats = memory_stats(device=device) + + def _format_size(sz, pref_sz): + prefixes = ["B ", "KiB", "MiB", "GiB", "TiB", "PiB"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_sz < 768 * 1024: + break + prefix = new_prefix + sz //= 1024 + pref_sz /= 1024 + return f"{sz:6d} {prefix}" + + def _format_count(cnt, pref_cnt): + prefixes = [" ", "K", "M"] + prefix = prefixes[0] + for new_prefix in prefixes[1:]: + if pref_cnt < 750 * 1000: + break + prefix = new_prefix + cnt //= 1000 + pref_cnt /= 1000 + return f"{cnt:7d} {prefix} " + + metrics_to_display = [ + ("allocated_bytes", "Allocated memory", _format_size), + ("active_bytes", "Active memory", _format_size), + ("requested_bytes", "Requested memory", _format_size), + ("reserved_bytes", "GPU reserved memory", _format_size), + ("inactive_split_bytes", "Non-releasable memory", _format_size), + ("allocation", "Allocations", _format_count), + ("active", "Active allocs", _format_count), + ("segment", "GPU reserved segments", _format_count), + ("inactive_split", "Non-releasable allocs", _format_count), + ] + + lines = [] + lines.append("=" * 75) + lines.append(" {_:16} PyTorch CUDA memory summary, device ID {device:<17d} ") + lines.append("-" * 75) + lines.append( + " {_:9} CUDA OOMs: {num_ooms:<12d} | {_:6} cudaMalloc retries: {num_alloc_retries:<8d} " + ) + lines.append("=" * 75) + lines.append( + " Metric | Cur Usage | Peak Usage | Tot Alloc | Tot Freed " + ) + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + submetrics = [("all", metric_name)] + if not abbreviated: + submetrics.append(("large_pool", " from large pool")) + submetrics.append(("small_pool", " from small pool")) + + current_prefval, peak_prefval, 
allocated_prefval, freed_prefval = ( + None, + None, + None, + None, + ) + + for submetric_key, submetric_name in submetrics: + prefix = metric_key + "." + submetric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + if current_prefval is None: + current_prefval = current + peak_prefval = peak + allocated_prefval = allocated + freed_prefval = freed + + lines.append( + f" {submetric_name:<21} | {formatter(current, current_prefval)} | {formatter(peak, peak_prefval)} | " + f"{formatter(allocated, allocated_prefval)} | {formatter(freed, freed_prefval)} ", + ) + + metrics_to_display = [ + ("oversize_allocations", "Oversize allocations", _format_count), + ("oversize_segments", "Oversize GPU segments", _format_count), + ] + + for metric_key, metric_name, formatter in metrics_to_display: + lines.append("-" * 75) + + prefix = metric_key + "." + + current = stats[prefix + "current"] + peak = stats[prefix + "peak"] + allocated = stats[prefix + "allocated"] + freed = stats[prefix + "freed"] + + lines.append( + f" {metric_name:<21} | {formatter(current, current)} | {formatter(peak, peak)} | " + f"{formatter(allocated, allocated)} | {formatter(freed, freed)} ", + ) + + lines.append("=" * 75) + + fmt_dict = {"_": "", "device": device} + for k, v in stats.items(): + fmt_dict[k.replace(".", "-")] = v + return "|" + "|\n|".join(lines).format(**fmt_dict) + "|\n" + + +def list_gpu_processes(device: Union[Device, int] = None) -> str: + r"""Return a human-readable printout of the running processes and their GPU memory use for a given device. + + This can be useful to display periodically during training, or when + handling out-of-memory exceptions. + + Args: + device (torch.device or int, optional): selected device. Returns + printout for the current device, given by :func:`~torch.cuda.current_device`, + if :attr:`device` is ``None`` (default). + """ + if not torch.version.hip: + try: + import pynvml # type: ignore[import] + except ModuleNotFoundError: + return "pynvml module not found, please install pynvml" + from pynvml import NVMLError_DriverNotLoaded + + try: + pynvml.nvmlInit() + except NVMLError_DriverNotLoaded: + return "cuda driver can't be loaded, is cuda enabled?" + + device = _get_nvml_device_index(device) + handle = pynvml.nvmlDeviceGetHandleByIndex(device) + procs = pynvml.nvmlDeviceGetComputeRunningProcesses(handle) + else: + try: + import amdsmi # type: ignore[import] + except ModuleNotFoundError: + return "amdsmi module not found, please install amdsmi" + try: + amdsmi.amdsmi_init() # type: ignore[attr-defined] + except amdsmi.AmdSmiException: # type: ignore[attr-defined] + return "amdsmi driver can't be loaded, is ROCm installed?" 
+
+    device = _get_amdsmi_device_index(device)
+
+    try:
+        handle = amdsmi.amdsmi_get_processor_handles()[device]  # type: ignore[attr-defined]
+        procs = amdsmi.amdsmi_get_gpu_process_list(handle)  # type: ignore[attr-defined]
+    except amdsmi.AmdSmiException:  # type: ignore[attr-defined]
+        return "amdsmi cannot list processes from other users"
+
+    lines = []
+    lines.append(f"GPU:{device}")
+    if len(procs) == 0:
+        lines.append("no processes are running")
+    for p in procs:
+        if not torch.version.hip:
+            mem = p.usedGpuMemory / (1024 * 1024)
+            pid = p.pid
+        else:
+            try:
+                proc_info = amdsmi.amdsmi_get_gpu_process_info(handle, p)  # type: ignore[possibly-undefined]
+            except AttributeError:
+                # https://github.com/ROCm/amdsmi/commit/c551c3caedbd903ba828e7fdffa5b56d475a15e7
+                # is a BC-breaking change that removes amdsmi_get_gpu_process_info API from amdsmi
+                proc_info = p
+            mem = proc_info["memory_usage"]["vram_mem"] / (1024 * 1024)
+            pid = proc_info["pid"]
+        lines.append(f"process {pid:>10d} uses {mem:>12.3f} MB GPU memory")
+    return "\n".join(lines)
+
+
+def mem_get_info(device: Union[Device, int] = None) -> tuple[int, int]:
+    r"""Return the global free and total GPU memory for a given device using cudaMemGetInfo.
+
+    Args:
+        device (torch.device or int or str, optional): selected device. Returns
+            statistic for the current device, given by :func:`~torch.cuda.current_device`,
+            if :attr:`device` is ``None`` (default) or if the device index is not specified.
+
+    .. note::
+        See :ref:`cuda-memory-management` for more
+        details about GPU memory management.
+    """
+    if device is None:
+        device = torch.cuda.current_device()
+    # optional=True allows `device = torch.device('cuda')` for which device.index is None
+    device = _get_device_index(device, optional=True)
+    return torch.cuda.cudart().cudaMemGetInfo(device)
+
+
+def _record_memory_history_legacy(
+    enabled: bool,
+    record_context=True,
+    trace_alloc_max_entries=1,
+    trace_alloc_record_context=False,
+    device: Union[Device, int] = None,
+    record_context_cpp=False,
+    clear_history=False,
+):
+    _C._cuda_record_memory_history_legacy(
+        enabled,
+        record_context,
+        trace_alloc_max_entries,
+        trace_alloc_record_context,
+        record_context_cpp,
+        clear_history,
+    )
+
+
+def _record_memory_history(
+    enabled: Literal[None, "state", "all"] = "all", *args, **kwargs
+) -> None:
+    """Enable recording of stack traces associated with memory
+    allocations, so you can tell what allocated any piece of memory in
+    :func:`torch.cuda.memory._snapshot()`.
+
+    In addition to keeping stack traces with each current allocation and free,
+    this will also enable recording of a history of all alloc/free events.
+
+    Use :func:`torch.cuda.memory._snapshot()` to retrieve this information,
+    and the tools in `_memory_viz.py` to visualize snapshots.
+
+    The Python trace collection is fast (2us per trace), so you may consider
+    enabling this on production jobs if you anticipate ever having to debug
+    memory issues.
+
+    C++ trace collection is also fast (~50ns/frame), which for many typical programs
+    works out to ~2us per trace, but can vary depending on stack depth.
+
+    Args:
+        enabled (Literal[None, "state", "all"], optional):
+            `None`, disable recording memory history.
+            `"state"`, keep information for currently allocated memory.
+            `"all"`, additionally keep a history of all alloc/free calls.
+            Defaults to "all".
+        context (Literal[None, "state", "alloc", "all"], optional):
+            `None`, do not record any tracebacks.
+            `"state"`, record tracebacks for currently allocated memory.
+            `"alloc"`, additionally keep tracebacks for alloc calls.
+            `"all"`, additionally keep tracebacks for free calls.
+            Defaults to "all".
+        stacks (Literal["python", "all"], optional):
+            `"python"`, include Python, TorchScript, and inductor frames in tracebacks
+            `"all"`, additionally include C++ frames
+            Defaults to "all".
+        max_entries (int, optional): Keep a maximum of `max_entries`
+            alloc/free events in the recorded history.
+    """
+    if isinstance(enabled, bool):
+        return _record_memory_history_legacy(enabled, *args, **kwargs)
+    else:
+        return _record_memory_history_impl(enabled, *args, **kwargs)
+
+
+def _record_memory_history_impl(
+    enabled: Optional[str] = "all",
+    context: Optional[str] = "all",
+    stacks: str = "all",
+    max_entries: int = sys.maxsize,
+    device: Union[Device, int] = None,
+    clear_history: bool = False,
+):
+    _C._cuda_record_memory_history(enabled, context, stacks, max_entries, clear_history)
+
+
+_record_memory_history.__signature__ = signature(_record_memory_history_impl)  # type: ignore[attr-defined]
+
+
+def _snapshot(device: Union[Device, int] = None):
+    """Save a snapshot of CUDA memory state at the time it was called.
+
+    The state is represented as a dictionary with the following structure.
+
+    .. code-block:: python
+
+        class Snapshot(TypedDict):
+            segments: List[Segment]
+            device_traces: List[List[TraceEntry]]
+
+        class Segment(TypedDict):
+            # Segments are memory returned from a cudaMalloc call.
+            # The size of reserved memory is the sum of all Segments.
+            # Segments are cached and reused for future allocations.
+            # If the reuse is smaller than the segment, the segment
+            # is split into more than one Block.
+            # empty_cache() frees Segments that are entirely inactive.
+            address: int
+            total_size: int  # cudaMalloc'd size of segment
+            stream: int
+            segment_type: Literal['small', 'large']  # 'large' (>1MB)
+            allocated_size: int  # size of memory in use
+            active_size: int  # size of memory in use or in active_awaiting_free state
+            blocks: List[Block]
+
+        class Block(TypedDict):
+            # A piece of memory returned from the allocator, or
+            # currently cached but inactive.
+            size: int
+            requested_size: int  # size requested during malloc, may be smaller than
+                                 # size due to rounding
+            address: int
+            state: Literal['active_allocated',  # used by a tensor
+                           'active_awaiting_free',  # waiting for another stream to finish using
+                                                    # this, then it will become free
+                           'inactive',]  # free for reuse
+            frames: List[Frame]  # stack trace from where the allocation occurred
+
+        class Frame(TypedDict):
+            filename: str
+            line: int
+            name: str
+
+        class TraceEntry(TypedDict):
+            # When `torch.cuda.memory._record_memory_history()` is enabled,
+            # the snapshot will contain TraceEntry objects that record each
+            # action the allocator took.
+            action: Literal[
+                'alloc',  # memory allocated
+                'free_requested',  # the allocated block received a call to free memory
+                'free_completed',  # the memory that was requested to be freed is now
+                                   # able to be used in future allocation calls
+                'segment_alloc',  # the caching allocator asks cudaMalloc for more memory
+                                  # and adds it as a segment in its cache
+                'segment_free',  # the caching allocator called cudaFree to return memory
+                                 # to CUDA, possibly trying to free up memory to
+                                 # allocate more segments or because empty_cache() was called
+                'oom',  # the allocator threw an OOM exception. 'size' is
+                        # the requested number of bytes that did not succeed
+                'snapshot',  # the allocator generated a memory snapshot
+                             # useful to correlate a previously taken
+                             # snapshot with this trace
+            ]
+            addr: int  # not present for OOM
+            frames: List[Frame]
+            size: int
+            stream: int
+            device_free: int  # only present for OOM, the amount of
+                              # memory CUDA still reports to be free
+
+    Returns:
+        The Snapshot dictionary object
+    """
+    return _C._cuda_memorySnapshot()
+
+
+def _dump_snapshot(filename="dump_snapshot.pickle"):
+    """
+    Save a pickled version of the `torch.cuda.memory._snapshot()` dictionary to a file.
+
+    This file can be opened by the interactive snapshot viewer at pytorch.org/memory_viz
+
+    Args:
+        filename (str, optional): Name of the file to create. Defaults to "dump_snapshot.pickle".
+    """
+    s = _snapshot()
+    with open(filename, "wb") as f:
+        pickle.dump(s, f)
+
+
+def _save_segment_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_segments(snapshot))
+
+
+def _save_memory_usage(filename="output.svg", snapshot=None):
+    if snapshot is None:
+        snapshot = _snapshot()
+    with open(filename, "w") as f:
+        f.write(_memory(snapshot))
+
+
+def _set_allocator_settings(env: str):
+    return torch._C._cuda_cudaCachingAllocator_set_allocator_settings(env)
+
+
+def get_allocator_backend() -> str:
+    r"""Return a string describing the active allocator backend as set by
+    ``PYTORCH_CUDA_ALLOC_CONF``. Currently available backends are
+    ``native`` (PyTorch's native caching allocator) and ``cudaMallocAsync``
+    (CUDA's built-in asynchronous allocator).
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on choosing the allocator backend.
+    """
+    return torch._C._cuda_getAllocatorBackend()
+
+
+class _CUDAAllocator:
+    r"""Wrapper over internal CUDA memory allocators."""
+
+    def __init__(self, allocator: torch._C._cuda_CUDAAllocator):
+        self._allocator = allocator
+
+    def allocator(self):
+        return self._allocator
+
+
+class CUDAPluggableAllocator(_CUDAAllocator):
+    r"""CUDA memory allocator loaded from a ``.so`` file."""
+
+    def __init__(self, path_to_so_file: str, alloc_fn_name: str, free_fn_name: str):
+        r"""Memory allocators are compiled in .so files and loaded dynamically using ctypes.
+
+        To change the active allocator use the :func:`torch.cuda.memory.change_current_allocator` function.
+
+        Args:
+            path_to_so_file(str): Path in the filesystem to the `.so` file containing
+                the allocator functions
+            alloc_fn_name(str): Name of the function to perform the memory allocation
+                in the `.so` file. The signature must be:
+                void* alloc_fn_name(ssize_t size, int device, cudaStream_t stream);
+            free_fn_name(str): Name of the function to perform the memory release
+                in the `.so` file. The signature must be:
+                void free_fn_name(void* ptr, size_t size, cudaStream_t stream);
+
+        .. warning::
+            This is currently supported only on Unix-like operating systems.
+
+        .. note::
+            See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+        """
+        allocator = ctypes.CDLL(path_to_so_file)
+        alloc_fn = ctypes.cast(getattr(allocator, alloc_fn_name), ctypes.c_void_p).value
+        free_fn = ctypes.cast(getattr(allocator, free_fn_name), ctypes.c_void_p).value
+        assert alloc_fn is not None
+        assert free_fn is not None
+        self._allocator = torch._C._cuda_customAllocator(alloc_fn, free_fn)
+
+
+def change_current_allocator(allocator: _CUDAAllocator) -> None:
+    r"""Change the currently used memory allocator to be the one provided.
+
+    If the current allocator has already been used/initialized, this function will error.
+
+    Args:
+        allocator (torch.cuda.memory._CUDAAllocator): allocator to be set as the active one.
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    torch._C._cuda_changeCurrentAllocator(allocator.allocator())
+
+
+def _get_current_allocator() -> _CUDAAllocator:
+    r"""Return the allocator being currently used.
+
+    .. note::
+        See :ref:`cuda-memory-management` for details on creating and using a custom allocator
+    """
+    return _CUDAAllocator(torch._C._cuda_getAllocator())
+
+
+class MemPoolContext(_MemPoolContext):
+    r"""MemPoolContext holds the currently active pool and stashes the previous
+    pool. On deletion it makes the previous pool active.
+
+    Args:
+        pool(torch.cuda.MemPool): a MemPool object to be made active so that
+            allocations route to this pool.
+
+    """
+
+    def __init__(self, pool: _MemPool):
+        super().__init__(pool)
+
+    @staticmethod
+    def active_pool() -> Optional[_MemPool]:
+        r"""Returns the active MemPool"""
+        return _MemPoolContext.active_pool()
+
+
+class MemPool(_MemPool):
+    r"""MemPool represents a pool of memory in a caching allocator. Currently,
+    it's just the ID of the pool object maintained in the CUDACachingAllocator.
+
+    Args:
+        allocator(torch._C._cuda_CUDAAllocator, optional): a
+            torch._C._cuda_CUDAAllocator object that can be used to
+            define how memory gets allocated in the pool. If :attr:`allocator`
+            is ``None`` (default), memory allocation follows the default/
+            current configuration of the CUDACachingAllocator.
+
+    """
+
+    def __init__(self, allocator: Optional[_cuda_CUDAAllocator] = None):
+        super().__init__(allocator, True)
+
+    @property
+    def id(self) -> tuple[int, int]:
+        r"""Returns the ID of this pool as a tuple of two ints."""
+        return super().id
+
+    @property
+    def allocator(self) -> Optional[_cuda_CUDAAllocator]:
+        r"""Returns the allocator this MemPool routes allocations to."""
+        return super().allocator
+
+    def use_count(self) -> int:
+        r"""Returns the reference count of this pool."""
+        return super().use_count()
+
+    def snapshot(self):
+        r"""Return a snapshot of the CUDA memory allocator pool state across all
+        devices.
+
+        Interpreting the output of this function requires familiarity with the
+        memory allocator internals.
+
+        .. note::
+            See :ref:`cuda-memory-management` for more details about GPU memory
+            management.
+        """
+        try:
+            ctx = MemPoolContext(self)
+            snapshot = torch.cuda.memory_snapshot()
+        finally:
+            del ctx
+        return snapshot
+
+
+@contextlib.contextmanager
+def use_mem_pool(pool: MemPool, device: Union[Device, int] = None):
+    r"""A context manager that routes allocations to a given pool."""
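+    # NOTE: the original body of `use_mem_pool` was lost to encoding corruption in
+    # this patch. What follows is a best-effort sketch reconstructed from the pool
+    # helpers imported at the top of this module (`MemPoolContext`,
+    # `_cuda_beginAllocateToPool`, `_cuda_endAllocateCurrentStreamToPool`); it may
+    # not match the upstream implementation exactly.
+    ctx = MemPoolContext(pool)
+    device_index = (
+        torch.cuda.current_device() if device is None else _get_device_index(device)
+    )
+    _cuda_beginAllocateToPool(device_index, pool.id)
+    try:
+        yield
+    finally:
+        _cuda_endAllocateCurrentStreamToPool(device_index, pool.id)
+        del ctx
+
+
+# Illustrative usage of the pool API defined above (not part of the module itself;
+# a minimal sketch assuming a CUDA device is available):
+#
+#     pool = MemPool()
+#     with use_mem_pool(pool):
+#         buf = torch.empty(1024, device="cuda")  # this allocation is routed to `pool`
+#     print(memory_summary())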

z2`}5P5HF>#JZ#4F&P4ksVT%Xx5G70yycYLcC8dY?K;0}4bF*?aU8y#j@8q&^+PG{^ zA}F(Fn^T}!hWQ)8V=C`|z0E%4mEjL7Ogj^QXf%#`C!>u<9l==;`5RJ*H*fXgot^cQ zjgf9_m62GH7WpeIfPME;yh6ph$&SdI0y|bJ@2Md5E!k5&Iw*1RwcE<=S{+pQSDrZE zk^bICWw5Fk-O}@O|AAc9buoom2k>9f4AG;xybGHu%O;S{|y~1%vk7S1a*1X2Z#2cub$|1=5f+60=Fis7UrQz2mgQU zy$O7rS9ShBG(e#g2vF8SJIDo(gFIdmCq%>{juR&)acsO0f=Opa(nwY;X@o|yqBstP zvX-SSwD1E;SpuajWoZipLLq@d*(q(H5TNW^fKXbtvi`r%IrqNX%t&L$PV(<3^YLRz z@7(w9_uRAHb9sZ<_tw{!w1zo1HdbD2%`>@(0zNUrlvKp!S)}&^_7;YU=J33q57><% zW``~9u66fn=lX<-(;nro*yT*rHsPH^bowBRAVU(#*PQoal%4|#~w?B@ywv zbSr&SiHCOWL1+mfDzYbtY~ZW4n3zz2$^~H>XN(xyJu}?~orqcASg?2Nf1f72rDbO$ z$gGaDsmm1isNLS*TXW&ATSkPGh0g$6zQk72l^W)k8~4sY53~Z*FIqXFZhbY<%T6=l zTwUmQ&Br-~a#|ohcZTd}5^>!h4w5X{IB%wH${@X;?e`PqdLY*VLfp4T%#Fw!<2jTv zVaP;Q!+IMk$DN@IWwOhxgUitgjoERke!>a+2_7=uE}0eFD-q{LAT$+N0F{ukv?Hj` zP+dA5O8rd2+kr;bMrLNx?`$-ZpjsMi+*B~>Tsqf1jVvY{bd+CDnjl|5`aHDVAJjHi zcnPpRD+P!gI zx0;w~+rLPL+^ok-j9}7eAIPBZ%B*GS=@kA6uX}ErX-=Vf+{#7~^bbDpc}1cBwY>R# zMY020#cfqAw9EA*H?M{NSy`#_S3*ccd_B;Oe^WJOq%_f*);1kAB@4Oe?ZUx<^upAT zHsSE~epY#QSWes)r~=k{sID^OeLShT)LWn%SW7DBrujq5GR@W@+k9+sl39EN?>)2_2knVus(&oe&R8YY&caW%Gi>8j7HB28_| zN7>k!96MCzZUK3xIh_iIvzQ<6A-gFXmfBbgc7I~D zAQJLkK8a^kvqLEZ7w;W70MjCp4)tGCuB=)y-g>0snsQ#6l^hQ06Wc9Gc9hX8J=8RjrpjX z`7yBwso1<#$SUH4UuDco`a6EYh*z0Pqvy=%78Z#Ey~Y;Gv4r_R>uARP>M(^4nf+ff z|5eDGlKCu?hft-zK6X3A@FMOY_Jb(0dg$ z|NFpu!DGN4up113&!Gdj9{eeIE@*>UupOKLeu^&OCqQQbo(H5K_)YLWWdG-Z^T2xW zV`TgP1d0V9KmWUc8<6e44L$^(3H}f$-v1Ag?f(h93JilgfPX};zXGfVgFtZrZbYWP z4!je{#{U;UcLn|e`TgI)+rV?dCh%X#@2>&7fc*B=r4iUm1zxoBr&PkS*t2 zLauCb&(PLv*yQ4SD}y4`?zvfsMgV)b{JQ)i&B&NC@Dk4PXBElZB4ORxcU>CbG^7-B z?DlhoanEQ0$U#?;D$->rSo3Lua3E$utshnN5uDC>*Rl%Fy<_F3nu&U8d0t6-u(n6)NB3)^sWdy2IC?1#^}70x^S^!F27{4+J$wNt%gyrC0|9?(J323PVb;^fz;TU zB5A1LiE?@B?am6WfWFxgXD5?15tUXP=i*}A)S*D|t%05~b}F}=<;t2=z$fpSldX~c zy(e$DKQ7E(OPreC)Qe%@la|Cj$tpC&+M1|>UIij^fVV-<= zD%Nv0t|&LBX66<>fDDdJT4rr-A+QTG1NoGP9x9Isj`k?9zV~5qB0v}Y%0K2imy7Do zEnCJ&#bq0};tN|{a5PFGs+ygpdNsDg$U*!g*Qga~Jyk{bDz(kuGGzuri8}Vzg3^D5 zU1t&dF1oG8os4=}Rk-Sno@H7jMHZWrbeXCW_QCv8U?+v2@hfv}Q%$q*wGguCy&7D| zu9;dUQgkEh>Be_)etCqs&@P7+6pBa_g_5IxaL2}lf*ZsPoXA!83^Ek`o7eh}7Vw>Qp~wuUVZ%0Zqu7s^^8mW0&7*nQZNJc-H0FOJX-y zMiz0b5bewqh4Jk^q-nWAD+Zoae))aAT4R}wPd8M*S`BF{`K!|Xg_^D@>=d)|{+-^? 
zgsbjaawYN>(vzDK(syb$#f$MtA5FXj$!GrYPTN=z+y{mB(5q zuH$1ku&3!}!C5-Ub+@)UL&b;6+ry!ey(E!;P9{yFAncK*x6apkx)Kz>P+aDn#qL>R zT+%)*sFt1V=Ed8kvkN-fWRSXMh+Lbw$U8|ZU1-qlX0>~2gKal%DRVOdRMw`s&kq<>XLDyq_U#&(otUA{Ql+6ug(Qe%i=j1ZkWo4_+?xDfW88lpvI({e;wQtT#J0a2u=dOM3(+Nba5DHBIs%>jzZg6o$nRfzgS!Ha z$J>Eo1iS=<1?1ZE+UD$R$m^yH34?7!TVyX4LJwuNxmB_QrH1wP{7ksWzBO$u3&BY6 zHQBz)Ba*3349H2jUx41Gs z=$~~qOEO2MRuV^)XU?kY17~)ws=th}&?8MH?V$o;*aO$3!gMxkn`>H=gz#Tth*20( zVNupH)wkO74W!l&P?Sd((J6#ehrvi8$t7^ewh7yKrTOT6meRZLMgpbS71A?URyL3&GKtXZY*{TVn*HJ&^pl-NuR?(4X z%kunWjQmt#j^|A>ZChYb?}vPYV|%-kQ^Is^iZwqIh7E6&pp#XSV%hD*?7{IY*N=#@oJOut~-BIukW~M_x7Ee&$WK5Z94z_?VD?xw_mWOUau;S z2tj4%ZD@t9p^~~sW3|)SOf<>ZXefKM*@kuDj%qp8bXnse6Ny5)4sLj{ zq&dTIr1P*A(s(d)@?Cl;)w<2DDNly9rtq0bH|Q^EzuIuT2Ub~uOC(5ErL|G*9IGzO z;=NZ1(oiMJQXvqHNpUS4nkrhD(njJ=CgXx*g}@mn-?CxLVm=hA ziR>4JFtZOPWo#VaU^Qors_g^gxUa?cH|piwVoz!xW!{^;{-Izua2|dGaOT<%b1*tT zHB&*}8*}lbd#?}Yx?Lv)i+!D4-*LRk+UjxwLXM@-=EipCvEQnBe@oRWDFVZvsyg5c zJ+t{4E9RRdE=Q;yHC@<%!lj2soaNy+Sc$*#b%>=$Ow6^ZJBY)J&$cp=3F4S37Pl8cSTK# zk6v~Zc0(uP*V$^hsluw{R3;nWie=gHN$e~l2@s?S)KaKlxF^kgW9fG7zNPGYL35<% zQFUyranRYAri`#gu4_x81?RG0>Yz36R7;9`?0nkA>X6G#3dvX#S%C``$)!!TRoN!R z-%)~xK971PDeYtl%4HIh$Db>t6%Jt-Mp~#VC#A@0s3Hd@*pk>1<{Io~xlC5ZqE>n~ zZj4F@F!X&|k(HV82;I(;#W!;K#7e}Uo=>XBPviP~U}3Xjjbe2-x0?Bp5$t-o3|DgZ zBMRo^Q>++Qq&FAXCCdzQ+1(A$%~i`QHF_)2SLAX_vI)OnoC}5zSL{X#E2TQ-nT8nJ zj2ao=Mz#dE;VzS}$c-j(bVy*}7PY5pqL$}pVkgeEO#UZqkj0vl{eN7-KcAN(_y2j& z0sJOE-;C`4Y%l>H1Re-}7hI3r|EJ)&;3RM&kS?GCK7y?OPv8%MWdD1E-vRdmUq#M; zGk6n_9^gzc1a1dP;Md6ZlKr0nwu1}6S>Pwg^q&A92Oj|M2QLMZ{SSft;B@diAS?ST z2H+vE8VrH~@O9+-SAd;B_xybbS^k6I8t@))0Gtmb?<)r2{lKS?>#qjy1aAO}0k{g> z3H&DbCNlmT!Lz`hfLZWha6Gs>xaK&{1Av!+XM$6~gTcMQoxqoo^`#TgnSp138PEdL zKz;>^33#=1HBP_9Lv4ymO%zho1g15M6z(}}@bb=KigjmVHA9Lz0TFJdop^?t!rWE# z-%xp5b7}{_#E@r8XgZRLOx+qP7AmdjAF9-K8ILw6%Oi7XKq(IL(+c9) z>BaboZ?Z@GE~Vxq*9_;ki}((ik~}nKvscZ9lu}HRbJ59eGfI&9`=yFPJ@1dvH{(s@ z_KTroOUg%|0)MYZf#jw1FbXvpo6AS5iUh#`p%XqAkVGGts$sQB!qNUQ*(!X z)HZ-5kV?sOPg&LuDKSosd4xDzR5cy#W|oZCOIjH>B;lEfnOz}iEv54>>{qIRiFmQl zG&9MfX&@Atq7_DzjZaOGzF{ zww0xF%1JT}dpRSEOw6G%xJ0T|RXolpZ$hlY6nZ>)4y6s$RPNsCnxo00iN;R4$ofRn zGIYCC8?jC5O~;`HCD)jBq$}Uu!g-v>hu(HR-ub^yN@Fwn6Pv({igIbdb>qJ?6UB8GBe_=$PwJ) z8p-PXDnWM>=YYDhKPDjV|I+p$_}O+X)v@b|<;%cfKis9`u40{A;0aQ)ZatHvWqhMK9b=K zQ6J~k+L_co!KGG2%+iz#6OE0)=VzGXB^3g#&dkb8F1l5#F{#*rauUPgPJ^&U@r@?z zupOZ*j8TXt2lvKn&d%yqOv_rsyE(OUSfB4@V_a1JmGdoPPeI&RRB ze2<8Fxdt&qp7NUhgyl|@G(Kw!(*o0lhvDECr)BCmTsmQ2OcLd}TwEl4t3+#jCW$q% zSDIi5p(-P8TN&YCXN5JY4K!_DQUNnJvWdskHJR*cGpnk0M_yXAX12;)Hs4sg!^v1? 
zk~HVdXPtk3rC|C=`zcFW)G5b^yy*OY6S-_2xl;1~75vagvH#x%{vI^Jso-14`!4}w zpb0Jpiv70=T!*~>aquGWXW+>|egW%13H%o_|EIu{!IQu=cs!6lfbIa;03HH<6MP!^ z|L?#XfbI$W6VL=(z}Y}|1Kbx#PoVgK*MV!myTRq)Ah- z7%YHC1NjLY4}O47;4|RO;Hf}o2(|+03=})y=jaH&0X_!a23`)H24=w?a0<9DkS^iB z(G~nB_&oR=xE{O~$dBL@a5C5c?hd|>-ryg=1Xu}<1Bxm5X7D6%5Ud7+;5Wgy&=Y(D zyc+22!5@MBU;@PZ2zT(i;tk#d91rdee#jV0zi=(M3cL;|7U2Ro05k_41kCYFD^jdY zV7byw!+)!|oj&XN6n4?##?eK~kW!XH;`wR$t=6W{TwEF$u@2#XaiTftTDnA5FXWQT zLv1j><(O+93X1+)qLo>jY+SZzf(%u*;_On_ghE%0&+wI9;6U8xD?SW=X}iMj$~+Br zz~p2fV+IUVwi))~U8X9sB@)D67P=;z4Kg_ZlK#%dsbnYm*p*&tyM{* z*HMEm{j*H5-QMV0R=0`=E<3QKU3#vN!0Wq2PQ9}TXAM`#1SPT52B@MgP& zDN)-oOpNWLu^23pagq)sTyy*qw@z7u(wUH(dR?kx6Fi^DTXlpr>z1F4zHZQ=%k%|<>@;fr*8xjm&wgv>3FU@!iAJZ8)Jju*0>?U z*5QDsS@M4Jn6x90!sXsQ)5R~mEctAPn&dmBHOX#{buj)t^fGCCg#Chzh!f>O{W?w zi%mLiJl<-FVc|<06ynhdJ&b>poNP?AZTF4xl})E~Qx1Ed+TIcEH1l0B=d5Sr*vU8yd*vJa%QAaHnKf&hAE;M9H_COCg=T8;c=44M(wv zq|qegixkTt5g~B!W0sjT*Bl&XaC6eVirBF6?r~pJv!HwD zN69xe!j!4UWK*~8xuQ|@*i`IT@)u_ox$qrnZh^zt8k12)gih_TMH^EG4--y0FtZ4~ zfQufCSao7Ft8I zJ^)vML!b>L`~MKx{@q{{`~g@2?hQVK9RF7Eci;s;@&C>Q4+B3%ZhtTMYj7E8fC~6J zGWvVLRp2kcR&Xvj2b4jK1^7Z_^&Ma%I1NaCzY+QTd*F-U`QV8_zW$E@L*P{KYvl5) zz{@}rjDitxI=CA+4txfA{6D~DU=8>*viRG<+kou-j|F!Fy6f-5K==JkfyaZh!TrDw zkhMPq6bEn*xE&~g4!l>4mWxTuN#aQuV2nj7n9Gv9X%oG%BVrG(% zRfX0ltJp)4I>}NwIYH2$GQ|c_Bi5gWW*QT3=F@e`jf5gX=&HV~MT~YWPmBtI^V&?j+bn~SdJE^1Y5M0^>tvhos+6pG4D?K|6}3DPk!RHw#O zoN$~U>9mkd7MJ0$g&3!-26T#Z**YZWUCnc&XdUXOMYD|su9#4Wl?qd~#$m6NGgLG5 zNtt?|yn5E3#bbG|s+1^{l&wCY9(J;m6JTK_h12|+VpzIGd=eMyv=d8uKeOY-Wdw&6 zS01FwUsmi7Kb4k==@=-gjK1#sYln*DS-YR|kijtDSs~kO?}}5O@62aCoNl@zOOJgH z6;<^0%rZF3`afx?+i%Iz&Cb22jLq=Q2~rr&xc{nadh5F$~CR4RNx)B#6a#lgc8?ir^0jU}mPdd`*Sb#2adc%C}7M~2pMd-F0u z>fV7X78z=dmDvDr)U!VwIn838vKazm5?@ zk>ww5W~!l-^ z8pY2o$>qox7>Q|zlf6;XKtN;A$_xSRi=a8dfugCk?9fw9ix%xHlr+nKIo z!S`w0gHt8`79`N3My(U$c}XTRM%xy<+$$Ty6O%BNPH`=|krf;$GC3s836jk?hw9|; zbU$=U1uEeF;C?{%_?H5m<9`VF9HikI z23!nI0zW~Xe-+pXP6S^-hL7_7EWbYxs83%4uH*O1z!1>Aeh&k>>+d7reW1sdsj1F6 zuf`LwLu78r7KKUytq4la z%#DZM8|)(`Wdf};KTcoaZH|wGMX=4z8b}(2@R^mteT!i}VQXRDAkKI?w#%cf5ffUi zxNr`2H{>#K6>%v+M1$ArNbuTaQ689Rl+V84G2+OyPkJPz5WLEQK6&)5awE#|T6!X^ zTVURtSggjEcR+W_hh0gW*+9KMR8HxXam#v64Rm{f_go09!O>a>0GZP6J=0T-7$(&o zY-tITf=h+`20fQ$BICt%YfCp%}fk6j6y^E*$gZJW=1&A57HQAd)8)x2EFPU@^W6 zwI1V;FvDH1YwWxuV_hi~7gMXS`Sx=$q;r2L4`A4FtAlbfKePiAbI3y{SsY{zdoD=C z#Z5JM#c}$bP!DZ08z>Y%5do8)FpgdG!eheN4FIlU7;VZ^6Xg(1HP=1AS|uInFo@03 zuCFtXmVRB=zUa*uPR51t=%`;TEHGz!A^aMo_u>oIvTf?_5x*WYu8|5RG!kzoD9kXt z?fSLrPDMS$`m;5DTA9dcaZyU69hR)d(PD|&xgZs9uFMKJ?bm-83#1TvnU~FMVihTu zmoD~nZlCX2t}XSY%C!Vt%T=|hlXX{Xii71$vrNS^7yJh5D784pUaZnP_&wWqTb$FA zzeI7S*`PsanSjHyEe<$B_ZFC;>KiFpVwN~SP7#Mt$`t04dzcPF+;rWv{AcW_yjLZi zU;{B_%_v~aPA^su(yqa@YX;NNIaECu=Ek5V*I+V12W^%g(&`JNl2XA~MvvN*o$U6k z)X}L<+Bw1bwxcWj`LXGOT$!B>_OFk2LnUv{DkN-2Q{y+srm->i)U4K2Gc93vLSi-}IN9YZQ>jPN`Nu!F#`Xi~`P3;j|i<|Q# zC7kN;OA#b-7?GIfHVX@TBTuTyeM%Pgyc zva~+UX@ZHyYVO?*lpt9M3MFKR%{a=W9Wis{j6U&@!0Io8@l3)^8J{NJFxmztrmaAq zZRR?v4V;i-ijK_nIWY3(sTO-G>}UiKpfeK1Y>*;KQenB@WK^10oSJ??8yhs-0`c7c=aA!{1{BwS7uX6);8n=+Eiesq@89o$c*g$+$n<{& z#z76lv;Gg}_g^F5%jPei{l5ZZ;IUv0sDhs$?|%}!2k7oU+5Mjf{v62e{}tr@_k(`` zhro7l0oVq9h3x-op!ok60LAe8SMUk&esDQ>F!(OA{&&C^z@LLdKsJBH^Sd1=fnOv4 zza7kiCxG7r-$%y35h%u=;`hnMe;4pu;1~3@`ujR?4d9-Da`-H7^XNOF=*bZoM^pkN z^zfZ@k{!ji0TakV#A{zjH@C}9nB*ljD5G#BdZsSgtaE>u;x7B~qz(hDZD+Pu z`sQjwqK$N@o;AAonnVfDjz$1EIy-D8)$$Iqw_J$U(IOC5d!_~5a~UWJx@3M_L)wh^l#x-HC@A0nIDy=JoaLcsltP1VX^>912tq}_u zYd7k3TO`COBruI`!E3m8<>9ZF%pB{sY`QL)jGy>GxsvK^RvqCo%H$Uvl5J@pwbkTH z^k}Uft&mygXbv7K>953;h}|tYYnt^G@X2BXhm|eW9Sm(3t}#nXm<=X*G>O@X^H&mj 
zJnq?*$l0kiMB>zqcGee)T0K9@u@4l4 zib(3oE4q{32Gvv9ueh-;--eBmDYV?x+xmE}RRxxM50r^A&f9|l`b z^yyJS&w`tpxnTZ)SJKmC^J0V1Q*Bt3P$i(?R67oDOr)PN4+4X8Z+rF|W(e^GLp@iO zPcN^HN}!?vnF8>BVYCx}G0xC+@6&9ngUA|Y*T(YRZ~!Oc@ioIq!2tC;`~G=kR{+EQJXz~OBo?`p=F7T!EWl8wt1L}Ieg18^Vkg!`(Q zo+iog=xC8_3Sn6G8;Bt>a@i|1$yb&(*Sxh2ao#O*F3Y=fQ1<$Uj^Hi%_Rzj=S^$&} zST=NfgGVk<=v(zHr4vjdscVqY_f3rJz8-}4X=}@Fs(F~!ILk6h=IKZqtH8w%&2SHK zIb&s0omQKP0RrI~&v=frnG}-V)zpq80u`l|L<9dv8N!HO8#&1L!a*=^M4Jz5iEcMU>-aQ{21B)N8mN! zN#KvbB|xzO*Msk%6L=$d7I+MJAovn`fOiAM1H23@f^jegP6BrT??*52Oz;e_5mdoy z@Og9rZvl@3j{rkJx`AH-#Q=Od(0PFSg8P7fL-x<=1zybGv*1zS{@}ln|8D@-gU^CD zfIk73fb+ph@B?%LUj^?3?*M-SyVz3UZ1#7@B&=2Si zfLDTZfb<1FML+N^a5Z=*&>8-%;9Q_P1a&_D9pEY8kHM3H#^+zq6+D5T4*~Z8cLx8> zGd}EKA)ay+D&cNsZKTbIO>BcnmKytLXv}CcUFR0Ku7ZA5d+WJ{>j$N7(=wi`We!m$ z#JEq^uTK$F;}S#>f<8EZK7BrfG#JBi4AES7?)cFeqT6*!F0eq%KZygaB?!RciHxrZf2;&J@Kbwh!VxI) zBIm(aMiybu^lWpSevXD|?+IxmlinzJe)Z>j=M2aeIy06UOWY(WGgpV5OVz-7rdt&< z{p};!#R)J>x3mka_qHBAKQ!wubJ5XsPnDWs<}c}pX77ep9Em5Iy)Uk#b4@b+JN(`( zdC)xQji`k1Ra|_-N3C<6meYlL6FjOiX6!_?bX`J*Dp`CH`PoDzK&bJu9#R>)m*n1$ z;A%ZIkoe5>U0iCDPewCvirLmA-`#}fWafNA+4^&T@wO#(=tubVDbU@?rQDa73oG*0 z?&i|Lw>a;(Btun@wv2U`$dhFyqyAY0$A4ESE1jVBzj9}74$Iso|Nn6W$ZtxHME-xA zq)9&i4>|vZ;3Du?Z~?djcpdWnJeUI`;LhM%$oJm_p9Q)Pa1qG&{|~{hk?TJW-VWXZ zo(Qf0hk#=KU4wl89`F?K$6yhx1)oC3e>r#=crG{z+zZ?j+#Xzooc}5yzkp|g)!-M% z_PXa!F#!G!yaBu(><7OC?gj1)u0g)n`Tw%fDAf$aa^MmGNzxE?$g$p2sG0xDo7cqmYxKND=`_YE4_ zjOUePvPe@ia=NRjqxZ|<$?HxMWehRxTc1KYj%F~EC99r`F(Dr_pM*@Jfuzcmtfbb& zK%1ZwRqZU8U)JUYcjfy3JI(v8+q(fR4~#e5>13-l<2y$50@URp)9fXUHW%=UKZf1M zQz0SaQ6XVBMaeqoMqju&_ufK>)F^4lj7C`oOK6$0EQ9Uy%}X1K*JDTh#hEX7U~)@` zgZ?wZsoa)pY7HG%% zu8S+A67*D@xvfG0nRHHqe)9WeWVCCPvuDGCH)I-kEb084cR2>q(tSl9mRIC7r=(5i z%t%;7Yx)9zT4+|E9+BpxWYbDhYkt~JBOm5Kou0o{pS^(40`FmRT2jq3U|m40yM0GP3+ul(=ciQ>pKn zZ#vv~QoT8P#IwE!Q7G(3HhQTceapPA9>1_Lt3~cX-ZYK_&VE{!?eZFy zR}Ina>;>o9B!gDHA{i}s@LHYb&Igku5#DrfWhQY4DI>h5)iefJtLhhzu5}B#N%iUo ztE#%e)>qtIa8Hj7?&4I^W4JKUXZzlAX0e5S7z$b}m8H-B<@MG>dEPOv^2+nTmzl%< zkr{sS3?(zRaKVOAscw5%k9ptcg6?Aa=`lra1uHt)qcd~-wnZIDUk5e2ZBfUYCYN8- z$<#X9>$=ITT{7+=>(uaclU*~sy{t8+DB|vuoo(sN*oPQN~J9sTn?EkG`6ZjEw zzTyD>4|p>;ADjn%j(o59e~R<3Gyb;+$ASMvzSrG;DS)61WnKfK$Q4!71Pb@Cju5e*kX*GeGzJ{R-Lsm*5vb_x=3^cs@86C@z3( z{fZ6nG30&O`QHK*`)?6k3Vt6%J-{@w|2bd_P<(*@LH>Uc7zOtRiV5&z@GYP_{+nDFE<5@^HgK?g+0e)b~)+GR;xExaZw$?_D<(?oCqGN%6_oYtt)C<1_SPdk4;h zr-P?DZ{ke-*ycaZ@gKYW#}0n@+p@XWsKZoat|p(GO8YXxy?XcH|7dM74zS=V+HGSx zy1F?vS|1}L=Kd%qGP*gYdK0G|M|{?UhFDBVqWp7Mz8TJYtKh~k!$}MV`QlZEISieu z7Kxg*u#a1;*%(k|%-f~tQKdug&#Kp_ zvuvMte0kqI^MNzujZr&6p7fYmV$u!^5>424$WAlc4Wm6dy{L5II)}#W;_?h0p1tY3 zNy5kZ+*15F&U6+uiUk$87q~0LwXdw&8?7kquw-u`p{S$tEM5?+67!bTU&eDzdk z6*!F$OD_BcJ%=RaHt9l8x@vi+ZPk8MD|09^wH9=>5%|Q527=T?-Niy6KU1?n*b)&L zWW;;9q8eSqbDbbH)gDHIawMBx=c8aLVBMlDvFa^vFUYsNyRpE?dk5WC(;+|R1l7}o z*G1!UIOc=6WDbXdcB5T~wJ4V~9;9q(a94m%9%BafhV#`@aa@3Elzz8q~l;fqVh}23!nQ zg8xD0e;b$ulVCrPPGA%GK63wcK=%REz(wG6FbF=4-2boO6W}>O_xYU(9tso};QxUa zg7biM0V~11!ExY!k^O%P6dT}=!IQyGPys(g-v1!@7ofWU=E0-E3h+zh{a=8ufJty? z@N;DSpMfue7l7?xD>xV25quNb{~O?+!TZ1q!4tqHaDSjX0e%BqiTuACNFN}7|0-Aw zegk}!{+G|c`~()kr9fjMpa1NK)g(Q5Aov)!T$8HDnW@rPP9B(@pEg5hh{z8sitrNH znF2SS^R!ctV-8u0q3pb_s!t-+$^Q>rKER>Jfy*C1efdE35_~Hw$WcuqkGWW0IpKt{ zDO_E2ATgv%Pd3JH_U^|Ng9t_ZTMj-V_OBjTrAycjl+-s89OLmxqjmWMlra<19~`I+ z3~`w_w>RR+0{BPf+ z9hqxa+8nD7{bIeg*VQZ<1bZIjG1V+-P?xmHcbW96L6@X@oJkcL(IsKrgf3328Y-s~ zy<#Y8itj*8IR;x&4fV-2k|`1fPhV+@q%Z2acgsW~f`!LJKg9S*J>}oDb+R~!?LkeH zs-r3c$sw`mE2tT@J)C2+Kth=>qW;NTOIM{_UM8E+hlz$`Oa?GBaZLk=yJvBNWhZbZO<^K?5#@vJ`Qc1Q_c5O{S?1PFI~@} z+3(!U(z;EaMEi{1&d+@9rs?w2_lN(aFi1{p7NP|6! 
z8mOWySsnJZFD&lnKqd@SQ|R^ke?OVzI|JpYlL(zLa|tePFuGgiw@+hHgxhK z7EO$SD+SZ+L^Gx}B7^^?KRfN=ywhi`avNQ)2?Hv09i@iQbGZLJ)HxULR3>tKo(mz4yrX++g-P9O}VoU{9Z`CEEzk> z!%0xfC13V{o_co^5v)0Q-A93z#QY)cy(GxN($$Q?RkHzH^?sqju1U<)Sq7;bKM)(} zIx=OEYj!^wgy;%s4m*>83(LRmU6#HZI&i%jRSxR!c-Irb$;E$}NNxGHN_ zM$>3m9GWQlXm6>J#%-Q!*Ou&Q_|Kl3(X;SHvLmG?%G`^3zqip-*<$tFu-2@eYcmm7 zv}2`N9u2E!S$%q*{@WI|;jC%XSj!6a3C2x+lXX+_zbpZNAMF2+4Z^1E{(lB!`+qPf zgZqGwA>YTl|4!!j`+>Uv`ToBZ$nXC&@CZ-^$AfPm=U)k438Vu!1^f^4{kOsA!Rvrx z{+|gpfFC2@%MaiZFab^hCxcbsgUI&#!9&4Uk>$Sv{vF8X|01vt>;~&W349BgUg!Kj z1onc51Ksa82Ob0N2EKy~|1$8$;K|@g;8gH=WcJU2Pl6|bhXUQ}_d%d@`%edt1G)p? z5#Vm%ACcLw0Ox{pz!tCq91ng%AAbg@uWtaK1|J7+*A&(~(g}Tn7HOp@+zx>>YQd*< zqnXbJ`%YLYW+yucM<%tKY+UCYPN4+d@8n|UGEw%v$0mFC52q{HccRu-*h~2lZKiNX%kjl?`I8^ zhviM(zoxXLa&DSqvRykP+oS^F+RY_NZ}@Es&5?;Q0y=JB1 zza*@2SULlbpUx@3O6qWZavav!Kv3JU(@l&HoxW}@n;vfSZRv)l_K{BF5WwEfCC#kkuh*_c}=+@pPt~_ zld`=@aj_ufSc*^M&lOsKjq;hBSKCG^7m5*_DRd^CyB{DcQz%nNF*QmUV(aknnR&z| zzw2D}FK^nh)x{rA)TM}hb8|E8p*3s92_`l_TqTs$nw{G>Pp$D6*6=r14z3y5uwmWE zi6f2X=>af;rThg z$20YBip@-Vn@;djt4lPgV(uiUo@Q2+%PagAg7U=pG-0dv&a{Z{Gd#J-)z)nBy3-(K zt5q78Z=}U{MxYCWU*%`69bv?)I)QP{j4nLE!rpUigDmKD*7RDf>FU0T882DuXvVpk z)=0RqLsiR7vy~z|VjWqmDsS8vBi7|I850LQ6y%aaS&4*x zj$((3)+eqr?|7Xr5v7wV+ZKaY#N=oXU`k=FX&M^y7N9Fbt!!2w!&p4*|+KN5oaN+|EVt^LANllxf7K!;#EvTAeV~Q8YV)j7ip+SKRbTn$4y>~%tZj0WsYk|}C zta6CnMXbc|dPimb+ms|E)y$KEI0t7Irw>d`R`uUm#s=jk8J-}gI9u=fjKY&q10q_dY0@L7paA)v7^Z?Qi=q^CT z?^C~j6?_87)~|a3?31MvD+*UzNPKDIRb$tH^qo+Lpkb%QW+BJoe~O23)Jx_?JY&c$ z+$0Hb?>5(RrWB!vr<>(7QBUkIzOdEh$l0bQ8vC}HA>OcH-HW?k96IO9>P<6(l>=)fkK(gzj)`7nEX?Z4ep1D%bs zX7S>56W3co+1)OO)S;GXqEK@5@$D!YeSsqBuC{2SLqp}6MJ{idcI}OI9NyqIe91k? zG}l6a%lSfKMiB`7YG%4Q8@G0LegseSc=I}Y!!Ey1GV8jWbz7De0TJ9UK(LTrg>Yh~ zyNqx5AzVv|*lPPLQ>sdXp z*sk(e>%_BSIH;EH^qb{tMeqpr`cUzq^7f!^)0Ir}tI4EE^o2dr*id%uAnWN0Bd2=m z6F8zJ#wG336B@JQ^yCRA?B~4jc-yrOc@}r+?1J_(8KkZm!q!aS^+^ZmYV9PgPQ!$8 zDl$TJUTb8HshIrQO*-A^j&}hoDvUI(m5w*7V9Sh^-KU)MjTj>MQ~5E^?6sWoKHOa*~TP#C@%pc zhUVbK+4h%WJ@oU4+P7aFX!O=)gb{j`6vn~lyXlbtr! 
z#nJ=2OW#`_cGAjeNFgJ+A&|>iei5!u9Qv8i3rVM@Z-hmgcB?rtln1;r-nTUE7q@S#RrPI@aJa!x#B=4radkL>BI|b$4BzP0nGH918%b%Y62o#t`7e&|sF?>-(aP??zpVvlf&c>3x z0#%3Ks4Tsx?yntdprEQ;v0?3Wn?7V=u8M&ZBsbv!%sg0}u{&##>J>>RTsa=vJ<)Cx zQr`=*CaV_adrt@RhL)=O`jXZ#5mUy>i>-Mk7gxJZ%xIg|Rdl)_+TLnW>*4uvTaDrw zoC0*MyH`8cCsdsFD1TjR$3$&UB+=qUHpHJ1O@Gp&=+tAtp92$@Rc99K>SC^s?b-w586r=(PSUJ4VKLQWVy?7QE^G{#!eXfP%yb*{y~z5;g0EZu`!osk+C~ue zafvDJbl>}XYc4)yi(99fGs?nefDJhp=;(?IJ4_J;;6M*FD(V-JsZh7R8tG-HnXplY ze%E}Q(|tV)CJjWhyb%+w-6QS|8%vxw(>BcPaA!{!RuqYHEr8{1jrdVfYmDbmvV&V9 zKdxE{>rMQt6Z|fe3420hFUo~2xzwALh+IT!e8}-n&gN+6aH0?mtF@7tne;mwjjjgR z?rAdHZ<-)qK>9qi-5)H?I3|YGZmeLi1>4Bv%Ti;q-}b>E%KpC-2J8cpJCXk{pvds1s8yo;P=7Lkk$Ve{1nJ%|7x%gjDx#@yMp6@WcU|>r-BAJ2W$an zgFAxnA;W(hydFFaJQXNzzwG|2!QH@j=-Uqh$@l8#z?C|cN+yEH)w%gF`+X-&8R)U< z0;frvl|VTwk?sXwKwFwE_MyrcH?F&H^T5KuDhYXGsrECz#X`(bPMdM;4#~G{PrkHm zS+_Z66qi>xl1f)XS2JEn}8uIjMb=bZ_?hhk|QI*Z}t6{gch= z=={`71(G}F)Tn!}59hjF)}UfvXC;d`Ugb*B6n+8xfM~Y1PNmZR=8@Q{c{W+KN{Ybx zr>YM4Mv-A?cEcojKH4d;#%Q`Av?$CMB_Ke;sk^&e97s0h<^n7N;ynDjg}Z9aj<4}8 zpvLCyBOFu$UN9HI>P{E7w@~%Ezu0ZV&L|&fTf=JS^;Qqr$4(mv1X~@8fG?RlrC6x zVlX}J_@(UYL0_cjF2~~oGYwCShgutjo-xXJmb!*5Iii`%f~nJNOxfG{=kQaVPhD~g zx~45+J5x=QDu(Ph8*3sPSD_-g67p~A?d)eoLKC8PNvb+YHl=K`;_oY@)(!y_Mp{_L zbx$SpgNhuOXwA3joVmt+;w5BXTp4HWN`H!Nok0zp65RASvz>sKQhdcRif`od$>yow zJ)aOrnYYt$`5p|?l&_=sU8t+t^CKg873VTsi7)WV!8F@q#RxawT!2k0GstDPM6-OV z<&{$7tVBzZ%Pq~ve_Iio$e~obY23BPZ8_HrH%A)M+h#6Mh9zZ7kf>IuWoN!3H!{aX zuLK5eQ4Fq0*7Gy56X!hKXMBHUq_j$=W_xp`_SMVjcz7MGh%F=y+&e)%L&l|;W4E8Q z8s?da%7CC?G*xCOSQ?25R+}lBbD1Pnvr166UR7(f_h+@hCX1*D3=!7Qr4y0yp$xr7A#KGl>ylxT zJjpPO058F0HZjm^L8v|&0p-@MnJc}^G@7A8NjAAcwSlyzQ!ya-C4J-KXntWL7Iv5& zlp?N9NC%aZ?n9Td?-uniC3+yv(*4sn&M}*;RVegA892f&qzPS(TTx)L{m-bTl^{uP z=9v02A6LPrrT5nVE44T<)4G=QSN0dk^SjpE1S}M_&^qD~_|XV6pxy%A0U2%TnkmdLz zFNeyZ3bK=xR1XPK_KW@q%Zi-^UIq1lQ9Dz+>gANQ=&umv#x`Yd7I~H%x@xhuAvhoVan>-Y{*)6)| zeXU8wKj*bTDP^T_@mR$e|4Em&xs589dC6&BKDSEjvXFbHpt|R*O`Ck32cllNDb3J6sUYD~S6)@e4$}a1f#3bsxssBn>h5X-{d} zbYFUNAPrx{_~42)p@?xcc@88^iwdJc>)mC1vu{^P;uv#*+ZB^qRBoxh(Y)6usYy9I zv!~48x|(mS=6M|NFu@h*83x~WQ=e$4(R_Y}IbKpBTU{d*x-^rE;-57p6)I310o;`a zVT~MLC(N*do2sqD7@g4h?8G-N$lH-A(ATBvOGQ~-tpnJ6 z!m#_Op?&egHA!33af6QJhCgFbNU4d)PBoo2=+cU_!)EQK*p;4rZEV^PU-L`4?_$Yl z;#@1Xk>@Tl&6%Rp8n3<$NvyWL(!}>SmPf<_w5uZ+@hYrQ`4XqiODbRiJur{O)U^Yx z34#%2U+{rRD9rQUwWl^@uUAEYfD;wp4(yTidg7GDdK z$4e(;$UsibJ3k#HWDZa?XJst6BEr%`5ymAic{Wx#doyzLqQXg%OoQYeR!Gcn7Aj=7?x0VIW8eisrCD9jNXs*4#tpRJIGkQ2kqU%i4`f({1&%}qb&?{2 z10^aVDr|}+s_(5bcEpcYlt&lQK7>fKzPSg|Q&Ps=xVk(gF#SV_X8}R7xf-5jr>{MV zETHWiY~}^+5Zw|g6?`k>S`m$LMoD|Xj>-A)ap}!A#WAiF_J+tCs0Bf}U)UN-`pQ`6 zBoC#C71XzdH=u_}*Ov5sm##??=$2ANZ%oX3`MoP}!s><#XB#te6;YrdW*$}*UE;3f z9i7v*(5c=J`3A@KhPD>U&N75&;%|ccNjSHfrQL^5xA(PL(vsLTPFh};PZcFNCsLr+ z6}4+6aaZRjCg)a5_--#wk8pqLG-~>^dqeE$dL3^~G>-Lpxf0@1@UA~IMT81Z3dVQ| zds^4p*=7$nDerpS#;RW5anbJWJ2#(e{Z`v_{`uQC*EVmzU`xGTRS*yz=@rL*pfNFN z8rjeaTSJzE#>#CvFuxEBl|9;QHoREUoMAXp7Nr&nat@b# zmmW&BZnNvgH^qrtBkxRx{3T|+jg#U557D+ng19Or|DTDNIe^?L`F}QuoS#P4e=Yc1 z@ERapz$3x;k@s%|UjwfJ(gXYs_%O2lv%sH#Cxa(}hl9I;y8_(C8_THp`CqriHw z4y*-=4fsj)0nY``0e=im2EPT~fnHz|oB~b;@-4VK_!)YESA&-U`4}|7F0cjs2KXep zfos9vfL&lGcqCX2)jk`c7yoWx#i_n!Q}OSvzL+sx1I9TAzAP3<~_{eB~)5G{j z$+_!^wy9I1IA)z&n`~URXpe&fk=ovIYA*9#Fc;gA{>0%wxO8ThC7<*08rvE!Ol-CQ5IR*lS2H<8KdMc zSN%**45v?`%O-x8+AvHinPEdzgeerR>;o>9nDHT~@xrV~Bj%QM@Sr)xmgH=kHci|5 zv8FA~EV?RSfm7e=-%*_`PDL7KETUt2A%{CO1W90rlxVpLVY;pA#Aqtc`3r2|VSj0c z-Eo(RHM?8O<-sXu)(g~^n*36-@A)rTYn2C#3sY$u3JYfrB@5y%_emmMF|mUW>%`y;|PyI zu2jtmg@>h_F}4p?UDb|$;gyt?7dPV{k7I($evF8gdb%u+}v6d5~Y4tjy?aq!V$n 
z;_kuCQF+5xk}h*#&Nn?xuuV~NlZShBepMAZ_LR6aKz+t0#^=KY%+_>{XOUG=7R)ba zx~E=O0g|AZHI4bnIaJH}WhlFsAK~;V7}BjNsYda`LmW}CaxKKS;X4x&>}-nmGMlHe zIfSh+>4-qxlT#UcyiMTzNgG~-OrNbkz`x6edTh>o*FAR?B$G(Zek8) z!X?tWs^W1*c@ttCrf?yW8&TT8-kq*Fnk*V!jU#VN%}e!(re)}MX+3CH!M`n|g&54+ z1QTOmVg;lFUIl>9%#4}HFjtp6VH za`1Fe11Evw!B3I*Ef%FA`24=uGxCopN6kqTm;17Uw2G0i10^7iy!FA{cUI|_SUI>Q4 z{ebQbcm_BZoC8h;-$VZYERe3?o!~iO4Co$#2ZC=P^S>ATHIQGy`QXmr8^^g%!Bza6 z0+V1rI13yHbZ@}bjN2S&fI859f*Q*QgWm$lN7F$cCSwglhMg`&?;Yl5@Cjk5d8&RA zCQgOHGdx+FPKz=tPV21jEfh*hV$Q;{?YT)LbArhC?qEBY%d$l+kVEi(Tah|6R#Huw z$r;r*-+!28dpbvA$&Kx^cBZG$84*NXy-P-I+f;AlX$y ze|KBM1!CA$#W&knaJ5CnElwy3=?>q2tgRW=(R(ch+DT7yq+Ex(Y`|FN=B1p^l>X8) zOk>Jel6odrm;dVW6RtxP>rkLTjV|Qc- zXkP@)2@VuZC7P$NolUNKG;?WpBkiJ(f?`MqZejGt_xVfFtV{weYT|Z@`;u3cF=84; zTkK-s8^c^(81F4KXIjf6P-JpQn&S?)<#8-Ja=Efdqvn<$W{3Nmq=a}!uxK}8eM3%S zSdWkxTMmJQdJUG)0HB;T63}@*&DUdM+nb+OD4g09T8T@8pxz?bvx*^)R zTu(x-eP|QpZBLaYZJ36^ZdJ4)N<~kEDZYU*j{N(6%DVVQwP(nCp5Uo_N293YF*|N6XCe1Q)IQ; z-hmFys2w=YXs#<=l6Zgk|H?NcS-OF~CHAbsGA~O=%+{);ayx3!rGJ(ww%Z$B%j#C~ zF!n($+NI|T3B0~bwJ8ytTn>kg*VhC&zXYN44 zc*QR}W;qEpOaz3N&D^K6?IDI48kP5GKJa@CG?z5>GZKU}~8E)5%?|Pc7>9l9kjxT!HMmy5w z%Tlm`P?LP8cvUiAFV&zzTO|LV!{k%^Im!Qz4+7(_k@J55UJY~?z%#)%a6V8BzRaBpw}a{af#v%x0tEoAu*fe!-7`11KLgBv8vgQtThf@yFekc@vE zkc|IqFb?ho?g>7JTz)ljzKpE?CGdPO4W__j!G%CE0pEk1 zJ`NN!;77>h?*dJ57WfhUuD;*R@4LX+;D7o1>);FE1Gm%eF3QhgvH7WC1T8d5!Kuv0 zvg`+Axc;usge7IR8V%uq=Tkk%gc(e)0vfb4DrP-7R6Q6Zok1;!!DMC)n)H5Xpk9x1 zQgY%L-qKfQVBee}wesRO;fCL%Qf}HSG56HAZU*^!#x;mjdx1f=`UJcC(e8kxmpXeO z`RdqoLC9X5aySVlRzDoy40@U{YeG^vb*J;el^RGOH*shz%guJqSjS0FH7E$UJ@xAs zBp<-6O|PEj3JJS=M}h^GKCjJgWW~VNn)`F2^^Y!)lJkfFoJJ*NRmf1|MGbl^oz|Ea znOYG#-qEI55Hk>R(ww@;E=lbFWKRg<{}LjFIjyU<3ZFMv#&;nJ{jbhXc@tv}%pqp8 zHohQLnC>F$6Cx&EFG_{0pZY9mo$IOxpwH490+8&=BiLnhcotW69M=2u=q_2|DnIDr z5?4Bji@|-bT0Xb6pfFNsZDiD4MDI5_!w?gBO6NHn7RuBt&Pje&FO=Q#%it-2$UdM2=TZO8MVY@diV>IeCZQEFRK2+ zZ*vmASvxo+IhK%AMB`Uz3@gxQn+N(PZ@kZ2I9PEsa@4f{sTdw8EAEGMMiO8{dpnvn z6U{R1bw5GZ@q}4ziv8l;flb?H1F1=FK$KX8&F4toi2wmzxM7@pL!2ba_V1;T<`U14&>`b5WwRsNpdb#3fp8k6%u z9XmD=ck{>Y@AeD^7z@lF=k6{iwAAm)4uE5EbsCTv_M|b1h8O;?;pQrz+0~R0 z+O<3}${ev%^J0A>jaY)uv0ZB!=iN%6P0G4JpTzx$?_vMj6zywVTPDZ+ij9aXUV5@a zBDQCHq(Wvu3I^VkaWGbxbBi>hVGB-mqPaRGp_t7RL|!l1UAdnY$*5N_ z50=y$Id0Az-}0VX$G~nQ`Tslyyh8(yDxkE7H|h38GZqr1Wp9kBeN^6|A|2N``s116*;{QszAQ^E5P;0 z=JLhg3eE*X;Kt*44n7M$0uF#p;8d^*RKU-X(LW31bN>=>0Gt8V1I6^e6ZjnDorig!Ifi{cMEyv|x8$wDA1x`p1H=~wlg(B_f7gZ9{oO;>;Xz)Wm(@~|r1 z^)(N@_^Qq)abl=~xp!>hQY>%7mxPnkN|cYZtta~QXo+XRCC=PE&*2nH8vc`%pr<0( zlu#vl;S|>rrR+dV9P=PJC|A12r}_y!@Oz?E&sF8q<2_N~yNubF0-$2ZVadLaxutvO zXImXaSML`(mTRSycW%8l8`F?2qLF$0YhAO@m|SQqN&pY%QSny0tU23qO0NtqRM^)c zx_77^jsp#e1ZIvgou0&aF7lXMMr9oT|YYL{3qkzTW1GQGR>WN(P*aO%sqhxT>T2B3ssrt5+T zcT&`8N%btH6NDnEYmn9VO^hq3lQ?1?E)W5 zG@o|A6|xB_BZOTXW3h>(9S+)Q!rhU%&u{B0W2PmW;ap&ERJ~)_avGCJFXc_6qeP_O z$`O1h=jXVMObP%+UO>g?ZeDYo>z4}>Dww9YLo_n{i8UCtwog+#hU3$+VW;}TLd;KH z#k4Y&lwVnb2+p1a*&J~nGdMzX1a$|FHUii-*IsN(STr=wejXo!vPBj*0 zkFu?4Ni%f3*UsI!n?}2kYrP)N7Avv=b#s3TkI9Zhw3m|4lF*yQY?ySbTVrACEnD~# zk)YYu!gM6-fgdm_QDg?1I5|%YIrRq|+iZ}*lZsnYVK?j==0dHS`;8u}xdFf(L-?&7%sdcVLxt5@7xvDnfW8Kx7;?O$NEK~9HU`yQf zQ9;(7gp_D4xFDXwh1TS;9a~!DC2! 
z0?+Rz$U(COxfhWg!*@cRPOyQPGHfpvQ}?FwJ7xQ%R4|s&qc+jdY8_*XI=^THco_I1x&Yk+sJjBBBaqF157-S>flr|ukl+5(KpoV;R&Xx3 z2lz00g69C8^WOk|j*j3x;HBWNz(wG(U<)`K+zWgSox!U>ygT55{C-#P+u&++1%C#l zJ9q%NKlo2{1785I24i3i_%VI2{(m00@phV%=_-Ts6;qS0S0rWW+`FtZ=?G?_&-wTx z+LzDGUn>x)U^!&YI<&1n;TXqp=~i?P^9*&>S!UC8*8y`@(Q7p^=hcI@Q4B)B2dVI& z?Nft|1LIi_flPT}zc3Lh@~M57>f2}^#o9tGZ()^kFd6;r5?f*DigbVVccE2*w^1MN znVxNqv#~}iwfBTLtE4vyyLx|om5zQs<$n4r%6tRkw+1mYxa8OEBJO5JYC0&=xyTI{(l&pNB(9yLrH6LGyh;f;7EF!;80H}jQb)U%m~e+5pxxhd?*Od8%Li}NU zz_=P;Bc>^~tNgkx5{yG!t)lDq8h$kF@YhRbj&)l$U6)M8Pko?VN%cFcj&K=e@;f%G z$(QKS+8)>eq>z(%bns9~e_^6>$DnRrl}>M5nqt6N*;r$xuIS>Dn58Am26Jm|A|i0a zF>y{!91F6E3x@#u^zK@6uT5_bZCAI;R?!?ii%L|y0jnbQQfr6^Iy;AVspS71lKGKC zCI62F`Tt$W`B#G&1Mn&Q{&=7m0LKBH|Gy6WGk70(0hj~3!O7qr;B(0T9|NxfSAsck z0XPjj0vr$S4iqEsI&=XCz(wF35OoCi=l5@*3wREAHdqI4K>q&<_#5zQ@G5XAI1#J| zIv4PIa3z=qQ(zK27`y{Lz*XQfun2TM;Gy6n$p0?@e+ixfP5^fXcLIt5_(t$z@I3HD zPzK*e{{Ia4H}G=sGVs^nF(94^cr|(e=?7j69u0ns4&YjF4R{mS15N@bf`!`S;0&zo zaP9IMNaY&B4zp|fe0CaA4X0J?h0M-xZ`CrVQ%qB?Yp@nv6(2Mf#wK{;j!l{aT_xok z1GZK5iysfa1v$T79br{fb3s`K(J>SnxfjNYRr=6Cw4mxmi|pA}QatsB=P3j}g0VQOC1D%P;C=Y8~x$ot(*# z?3u=0sDw2eyXWo|4jrdZIyWG(c#yj*l`OZD661mEXd_w3p z9XTsV+_I6fq>!@$Povs%<=cqQG zV$vvoLx`ALhHHx7%EE zqeJp?uDRy@a9~{C54d;XvWOEJ21{L=V&X^85i6NlzLYTa2vAZKI)0_9EN3w|kC_Ot zbSgPNGF~fPGKQ_RbUE*qF6qpMs+8_^WA!A0wSu*lZ1g)#OsRz_STGZ%8wXZ;S&B0r z8Iu1WiD-Fsu>W5aL_u+)Fszf z&bx(RZmf1oXVp71IRxjy*us)_pZhK?=lUPc?W<81P0IYR(T}ScS(QtoLO&A43Y}